<a href="https://colab.research.google.com/github/DumbMachine/ScrapingData/blob/master/STock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import logging
import os
import time
from datetime import datetime
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import requests
from datetime import datetime

from bs4 import BeautifulSoup

# Setting the Logger
if not (os.path.exists(os.getcwd()+"/logging")):
    os.mkdir(os.getcwd()+"/logging/")
logging.basicConfig(filename='./logging/something.log', filemode='w',
                    format='%(asctime)s - %(message)s', level=logging.INFO)
logging.info('LOGGER inintialized')


class MultiStock:
    def __init__(self, stock_name, urls, instance_time=False, frame=False, threads=None, full_day=False, time_period=0, logger=False, from_search=True):
        '''
        Initializing the required vbariables and settings
        options = {
            'threads': None,
             'time_period': 1,
             'full_day'; False,
             'logging': True
            more to added....
                    }
        '''
        self.stock_name = stock_name
        self.urls = urls
        self.threads = threads if threads else 1
        self.instance_time  = datetime.now()
        self.time_period = time_period
        if full_day:
            self._time_period = 0
            self.full_day = full_day
        self.instance_time = datetime.now()
        self.frame = {}
        if not frame:
            for url in self.urls:
                self.frame[url] = pd.DataFrame()

        # PATHS for extracting the Stock information
        self._VOLUME = "#bse_volume > strong"
        self._PRICE = "#Bse_Prc_tick > strong"
        self._PERCENTAGE = "#b_changetext > span > strong"
        self._PREV_CLOSE = "#b_prevclose > strong"
        self._OPEN_PRICE = "#b_open > strong"
        self._MARKET_CLOSE = "12:00"
        logging.info("VARABLES initialized")


    def get_data(self):
        lol = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
        logging.info("Cycle Started!!!")
        pool = ThreadPool(self.threads)
        # urls = [[url,True] for url in self.urls]
        finale = pool.map(self.update, lol(self.urls, self.threads)[0])
        logging.info("Cycle Completed!!!\n")

    def update(self, link, threaded=False):
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
        try:
            req = requests.get(link, headers)
            soup = BeautifulSoup(req.text, 'lxml')
            self.frame[link] = pd.concat([self.frame[link], pd.DataFrame({"time": (str(datetime.now())[:10]+" "+str(datetime.now()-self.instance_time)), "volume": str(soup.select(self._VOLUME)[0]).strip("<strong>").strip("</strong>"),
                                                                        "price": str(soup.select(self._PRICE)[0]).strip("<strong>").strip("</strong>"),
                                                                        "percentage": str(soup.select(self._PERCENTAGE)[0]).strip("<strong>").strip("</strong>"),
                                                                        "_PREV_CLOSE": str(soup.select(self._PREV_CLOSE)[0]).strip("<strong>").strip("</strong>"),
                                                                        "_OPEN_PRICE": str(soup.select(self._OPEN_PRICE)[0]).strip("<strong>").strip("</strong>")
                                                                        }, index=[0])]) # Use APPEND
            
            if not os.path.exists(os.getcwd()+"/data"):
                os.mkdir(os.getcwd()+'/data')
            self.frame[link].to_csv(
                "./data/{}.csv".format(link.split('/')[-2]))
            logging.info(f"SUCCESSFULLY grabbed {link.split('/')[-2]} Stock")
        except Exception as e:
            logging.error("Exception occurred", exc_info=True)


    # TODO: Instead of using get_data use stream and set the interval to one for the minute
    def stream(self):
        if (self.time_period):
            '''
            time_period = time_period min
            Run the stream for time_period minutes
            '''
            starttime = time.time()
            minute = 0
            while(minute < self.time_period):
                for url in self.urls:
                    self.update(url)
                minute += 1
                time.sleep(60.0)
        else:
            print(1)
            '''
                full_day: True
                stream for the whole trade day
            '''
            while(str(datetime.now().hour)+":"+str(datetime.now().minute) < self._MARKET_CLOSE):
                print("Makret is open")

    def get_stats(self):
        '''
        import matplotlib.pyplot as plt
        plt.plot()
        plt.scatter()
        '''
        # Thsi is the placeholder ffunction where the implementation for statistics will take place.
        pass

    def custom_columns(self):
        # This is the holder function where CUSTOM_COLUMNS will be implemented
        pass


In [0]:
list = [
    "https://www.moneycontrol.com/india/stockpricequote/steelmediumsmall/ramsarupindustries/RI36"
    "https://www.moneycontrol.com/india/stockpricequote/fertilisers/agritechindia/ATI02",
    "https://www.moneycontrol.com/india/stockpricequote///TAR",
    "https://www.moneycontrol.com/india/stockpricequote/computerssoftwaremediumsmall/curatechnologies/SS48",
    "https://www.moneycontrol.com/india/stockpricequote/castingsforgings/lgbforge/LGB01",
    "https://www.moneycontrol.com/india/stockpricequote/computerssoftwaremediumsmall/ushamartineducationsolutions/UMI02",
    "https://www.moneycontrol.com/india/stockpricequote/pharmaceuticals/parabolicdrugs/PD06",
    "https://www.moneycontrol.com/india/stockpricequote/computers-software/63moonstechnologies/FT02",
    "http://www.moneycontrol.com/india/stockpricequote/fertilisers/oswalchemicalsfertilisers/OCF",
    "http://www.moneycontrol.com/india/stockpricequote/chemicals/phillipscarbonblack/PCB01",
    "http://www.moneycontrol.com/india/stockpricequote/chemicals/punjabchemicalscropprotection/PCC03",
    "http://www.moneycontrol.com/india/stockpricequote/pharmaceuticals/parenteraldrugsindia/PDI01",
    "http://www.moneycontrol.com/india/stockpricequote/finance-investments/sovereigngoldbonds250may2025sri201718/SGB09",
    "http://www.moneycontrol.com/india/stockpricequote/miscellaneous/adanigaslimited/ADG01",
    "http://www.moneycontrol.com/india/stockpricequote/pharmaceuticals/astrazenecapharma/AZP",
    "http://www.moneycontrol.com/india/stockpricequote/engineering/electrothermindia/EI02",
    "http://www.moneycontrol.com/india/stockpricequote/miscellaneous/railvikasnigam/RVN",
    "http://www.moneycontrol.com/india/stockpricequote/diamond-cutting-jewellery-precious-metals/pcjeweller/PJ",
    "http://www.moneycontrol.com/india/stockpricequote/computers-software/63moonstechnologies/FT02",
    "http://www.moneycontrol.com/india/stockpricequote/transport-logistics/globalvectrahelicorp/GVH",
    "https://www.moneycontrol.com/india/stockpricequote/plantationsteacoffee/dhunseriteaindustries/DTI03",
    "https://www.moneycontrol.com/india/stockpricequote/pharmaceuticals/kilitchdrugsindia/KDI",
    "https://www.moneycontrol.com/india/stockpricequote/pharmaceuticals/shilpamedicare/SM19",
    "https://www.moneycontrol.com/india/stockpricequote/plastics/cosmofilms/CF08",
    "https://www.moneycontrol.com/india/stockpricequote/fertilisers/agritechindia/ATI02",
    "https://www.moneycontrol.com/india/stockpricequote///TAR",
    "https://www.moneycontrol.com/india/stockpricequote/transportlogistics/interglobeaviation/IA04",
    "https://www.moneycontrol.com/india/stockpricequote/autoancillaries/mindaindustries/MI4",
    "https://www.moneycontrol.com/india/stockpricequote/bankspublicsector/bankmaharashtra/BM05",
    "https://www.moneycontrol.com/india/stockpricequote/telecommunicationsservice/vodafoneidealimited/IC8",
    "https://www.moneycontrol.com/india/stockpricequote/fertilisers/deepakfertilizerspetrochemicalscoprn/DFP",
    "https://www.moneycontrol.com/india/stockpricequote/computerssoftware/8kmilessoftwareservices/PMS01",
    "https://www.moneycontrol.com/india/stockpricequote///MI60",
    
]

In [0]:
# %%timeit
something  = MultiStock("MY-GAme",list,threads=5,time_period = 10)
something.get_data()

In [0]:
%%timeit
something.stream()

1
1
1
1
1
1


In [0]:
%%timeit
something  = MultiStock("MY-GAme",list,threads=len(list),time_period = 50)
something.get_data()

1 loop, best of 3: 1.04 s per loop


In [0]:
%%timeit
something.stream()

1 loop, best of 3: 1min 4s per loop


In [0]:
%timeit
import os
os.listdir('./data/')

['goenkadiamondjewels.csv',
 'agritechindia.csv',
 'jindalhisar.csv',
 'ramsarupindustries.csv',
 'tataconsultancyservices.csv',
 '.ipynb_checkpoints']

In [0]:
import pandas as pd
import numpy as np
def data_reader():
    frame1 = pd.read_csv("./data/jindalhisar.csv",index_col=[0],names=['time', 'volume', 'price', 'percentage', '_PREV_CLOSE', '_OPEN_PRICE'])
    frame2 = pd.read_csv("./data/tataconsultancyservices.csv",index_col=[0])
    frame1.reset_index().drop('index',axis=1).head()
    return 
   

data_reader()

In [0]:
frame1 = pd.read_csv("./data/jindalhisar.csv",index_col=[0])
frame2 = pd.read_csv("./data/tataconsultancyservices.csv",index_col=[0])
frame1 = frame1.reset_index().drop('index',axis=1)
frame1.head()
frame2 = frame2.reset_index().drop('index',axis=1)
frame2.head()

Unnamed: 0,time,volume,price,percentage,_PREV_CLOSE,_OPEN_PRICE
0,2019-04-15 0:00:44.382739,156916,2073.75,60.0,2013.75,2071.0
1,2019-04-15 0:00:44.996041,156916,2073.75,60.0,2013.75,2071.0
2,2019-04-15 0:00:45.543352,156916,2073.75,60.0,2013.75,2071.0
3,2019-04-15 0:00:46.079569,156916,2073.75,60.0,2013.75,2071.0
4,2019-04-15 0:00:46.500610,156916,2073.75,60.0,2013.75,2071.0


In [0]:
frame1.head()

Unnamed: 0,time,volume,price,percentage,_PREV_CLOSE,_OPEN_PRICE
0,2019-04-15 0:00:00.212732,11281,90.2,0.4,89.8,90.1
1,2019-04-15 0:00:43.888601,11281,90.2,0.4,89.8,90.1
2,2019-04-15 0:00:44.794806,11281,90.2,0.4,89.8,90.1
3,2019-04-15 0:00:45.339110,11281,90.2,0.4,89.8,90.1
4,2019-04-15 0:00:45.758728,11281,90.2,0.4,89.8,90.1


In [0]:
frame2.head()

Unnamed: 0,time,volume,price,percentage,_PREV_CLOSE,_OPEN_PRICE
0,2019-04-15 05:13:31.597227,127223,2069.4,55.65,2013.75,2071.0
1,2019-04-15 05:13:32.156615,127223,2069.4,55.65,2013.75,2071.0
2,2019-04-15 05:13:32.827733,127223,2069.4,55.65,2013.75,2071.0
3,2019-04-15 05:13:33.485283,127223,2069.4,55.65,2013.75,2071.0
4,2019-04-15 05:13:34.165155,127223,2069.4,55.65,2013.75,2071.0


In [0]:
frame1.join(frame2, lsuffix='_jindal', rsuffix='_tcs').drop('time_jindal',axis=1)

Unnamed: 0,volume_jindal,price_jindal,percentage_jindal,_PREV_CLOSE_jindal,_OPEN_PRICE_jindal,time_tcs,volume_tcs,price_tcs,percentage_tcs,_PREV_CLOSE_tcs,_OPEN_PRICE_tcs
0,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:33.969982,135767.0,2073.6,59.85,2013.75,2071.0
1,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:34.647851,135767.0,2073.6,59.85,2013.75,2071.0
2,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:35.438002,135767.0,2073.6,59.85,2013.75,2071.0
3,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:36.000059,135767.0,2073.6,59.85,2013.75,2071.0
4,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:36.664171,135767.0,2073.6,59.85,2013.75,2071.0
5,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:37.233364,135767.0,2073.6,59.85,2013.75,2071.0
6,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:37.899240,135767.0,2073.6,59.85,2013.75,2071.0
7,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:38.507600,135767.0,2073.6,59.85,2013.75,2071.0
8,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:39.174174,135767.0,2073.6,59.85,2013.75,2071.0
9,5133,90.7,0.9,89.8,90.1,2019-04-15 05:32:39.750639,135767.0,2073.6,59.85,2013.75,2071.0


In [0]:
main_df = pd.DataFrame()
ratios = ["tataconsultancyservices.csv", "jindalhisar.csv"]
for ratio in ratios:
    print("\n",ratio,"\n")
    df = pd.read_csv(f"./data/{ratio}", index_col=[0])
    df.rename(columns={"_PREV_CLOSE": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)
    df.set_index("time", inplace=True)  # set time as index so we can join them on this shared time
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore the other columns besides price and volume
    print("HEAD:\n",df.head())
    if len(main_df)==0:  # if the dataframe is empty
            main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
            main_df = main_df.join(df)


 tataconsultancyservices.csv 

HEAD:
                             tataconsultancyservices.csv_close  \
time                                                            
2019-04-15 04:35:03.704200                            2013.75   
2019-04-15 04:35:04.291598                            2013.75   
2019-04-15 04:35:04.752160                            2013.75   
2019-04-15 04:35:05.315292                            2013.75   
2019-04-15 04:35:05.768864                            2013.75   

                           tataconsultancyservices.csv_volume  
time                                                           
2019-04-15 04:35:03.704200                            113,358  
2019-04-15 04:35:04.291598                            113,358  
2019-04-15 04:35:04.752160                            113,358  
2019-04-15 04:35:05.315292                            113,358  
2019-04-15 04:35:05.768864                            113,358  

 jindalhisar.csv 

HEAD:
                             ji

In [0]:
main_df

Unnamed: 0_level_0,tataconsultancyservices.csv_close,tataconsultancyservices.csv_volume,jindalhisar.csv_close,jindalhisar.csv_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-15 04:35:03.704200,2013.75,113358,,
2019-04-15 04:35:04.291598,2013.75,113358,,
2019-04-15 04:35:04.752160,2013.75,113358,,
2019-04-15 04:35:05.315292,2013.75,113358,,
2019-04-15 04:35:05.768864,2013.75,113358,,
2019-04-15 04:35:06.344974,2013.75,113358,,
2019-04-15 04:35:06.789749,2013.75,113358,,
2019-04-15 04:35:07.349881,2013.75,113358,,
2019-04-15 04:35:07.873998,2013.75,113358,,
2019-04-15 04:35:08.308430,2013.75,113358,,


KeyError: ignored