In [1]:
import logging
from datetime import datetime, timedelta
from tqdm import tqdm
from scraper import collect_tweets_for_day, get_last_collected_date

In [2]:
import logging
from datetime import datetime, timedelta
from tqdm import tqdm
from scraper import scrape_tweets_parallel, get_last_collected_date

# Configuration: collection window, tickers to cover, and the search query template
START_DATE = datetime(2022, 1, 1)   # First day of the collection window
END_DATE = datetime(2023, 12, 31)   # Last day of the collection window
STOCKS = [
    "ITC.NS", "SBIN.NS", "KOTAKBANK.NS", "BHARTIARTL.NS",
    "HCLTECH.NS", "LT.NS", "AXISBANK.NS", "ASIANPAINT.NS",
    "BAJFINANCE.NS", "MARUTI.NS", "M&M.NS", "SUNPHARMA.NS",
    "TITAN.NS", "ULTRACEMCO.NS", "NESTLEIND.NS", "INDUSINDBK.NS",
    "ADANIENT.NS", "POWERGRID.NS", "NTPC.NS", "TATASTEEL.NS",
    "JSWSTEEL.NS", "ONGC.NS", "BAJAJFINSV.NS", "DIVISLAB.NS",
    "TECHM.NS", "WIPRO.NS", "GRASIM.NS", "BRITANNIA.NS",
    "CIPLA.NS", "ADANIGREEN.NS", "ADANIPORTS.NS", "HEROMOTOCO.NS",
    "COALINDIA.NS", "BPCL.NS", "APOLLOHOSP.NS", "TATAMOTORS.NS",
    "HDFCLIFE.NS", "DABUR.NS", "BAJAJ-AUTO.NS", "DRREDDY.NS",
    "SBILIFE.NS", "EICHERMOT.NS", "HINDALCO.NS", "ICICIGI.NS",
]

# The single placeholder receives the ticker exactly as listed above (".NS" included).
# NOTE(review): every term is joined with OR, so the generic terms can match
# tweets that never mention the ticker -- confirm this is the intended query.
QUERY_BASE_TEMPLATE = '{} OR Stock OR Market OR #StockMarket OR #MarketAnalysis'

# Send INFO-and-above log records to a file next to the notebook
logging.basicConfig(
    filename='stock_tweet_collection.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

for stock in STOCKS:
    # One CSV per stock, named after the ticker with ".NS" stripped
    CSV_FILE = '{}.csv'.format(stock.replace(".NS", ""))
    QUERY_BASE = QUERY_BASE_TEMPLATE.format(stock)

    # Presumably the first date not yet present in CSV_FILE, with START_DATE
    # as the fallback -- TODO confirm against scraper.get_last_collected_date
    start_date = get_last_collected_date(CSV_FILE, START_DATE)

    if start_date > END_DATE:
        # Nothing left inside the window for this stock; move on to the next one
        print(f'All data for {stock} has already been collected up to {END_DATE.strftime("%Y-%m-%d")}.')
    else:
        # Hand the remaining date range for this stock to the scraper
        print(f'Starting tweet collection for {stock}...')
        scrape_tweets_parallel(QUERY_BASE, start_date, END_DATE, CSV_FILE)

        print(f'Finished collecting tweets for {stock}.')


All data for ITC.NS has already been collected up to 2023-12-31.
All data for SBIN.NS has already been collected up to 2023-12-31.
All data for KOTAKBANK.NS has already been collected up to 2023-12-31.
All data for BHARTIARTL.NS has already been collected up to 2023-12-31.
All data for HCLTECH.NS has already been collected up to 2023-12-31.
All data for LT.NS has already been collected up to 2023-12-31.
All data for AXISBANK.NS has already been collected up to 2023-12-31.
All data for ASIANPAINT.NS has already been collected up to 2023-12-31.
All data for BAJFINANCE.NS has already been collected up to 2023-12-31.
All data for MARUTI.NS has already been collected up to 2023-12-31.
All data for M&M.NS has already been collected up to 2023-12-31.
All data for SUNPHARMA.NS has already been collected up to 2023-12-31.
All data for TITAN.NS has already been collected up to 2023-12-31.
All data for ULTRACEMCO.NS has already been collected up to 2023-12-31.
All data for NESTLEIND.NS has alread

Collecting Tweets: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.84s/it]


Finished collecting tweets for HDFCLIFE.NS.
Starting tweet collection for DABUR.NS...


Collecting Tweets: 100%|█████████████████████████████████████████████████████████████| 278/278 [29:09<00:00,  6.29s/it]


Finished collecting tweets for DABUR.NS.
Starting tweet collection for BAJAJ-AUTO.NS...


Collecting Tweets: 100%|█████████████████████████████████████████████████████████████| 730/730 [51:49<00:00,  4.26s/it]


Finished collecting tweets for BAJAJ-AUTO.NS.
Starting tweet collection for DRREDDY.NS...


Collecting Tweets: 100%|███████████████████████████████████████████████████████████| 730/730 [1:02:22<00:00,  5.13s/it]


Finished collecting tweets for DRREDDY.NS.
Starting tweet collection for SBILIFE.NS...


Collecting Tweets: 100%|███████████████████████████████████████████████████████████| 730/730 [1:02:44<00:00,  5.16s/it]


Finished collecting tweets for SBILIFE.NS.
Starting tweet collection for EICHERMOT.NS...


Collecting Tweets: 100%|█████████████████████████████████████████████████████████████| 730/730 [53:02<00:00,  4.36s/it]


Finished collecting tweets for EICHERMOT.NS.
Starting tweet collection for HINDALCO.NS...


Collecting Tweets: 100%|███████████████████████████████████████████████████████████| 730/730 [1:02:34<00:00,  5.14s/it]


Finished collecting tweets for HINDALCO.NS.
Starting tweet collection for ICICIGI.NS...


Collecting Tweets: 100%|███████████████████████████████████████████████████████████| 730/730 [1:02:41<00:00,  5.15s/it]

Finished collecting tweets for ICICIGI.NS.





# Clean: remove repeated header rows from the collected CSV files

In [6]:
import pandas as pd
import os

# Function to clean data by removing rows where column names appear in rows
def clean_data(file_path):
    """Remove repeated header rows from a CSV file, rewriting it in place.

    A row is treated as a repeated header when every one of its cells equals
    the name of the column it sits in. The file is rewritten only when at
    least one such row is found; otherwise it is left untouched.

    Args:
        file_path: Path of the CSV file to inspect (and possibly overwrite).

    Returns:
        None. The outcome is reported with ``print``. Any exception raised
        while reading, comparing or writing is caught and printed instead of
        being propagated, so one bad file does not stop a batch run.
    """
    try:
        # Read every cell as a literal string and switch off NA-token parsing.
        # With pandas' defaults, cells such as "NA", "null" or "None" are
        # turned into NaN and written back as empty fields, so the rewrite
        # below would silently change rows that are not header duplicates.
        df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

        # Compare each cell with the name of its own column; a row whose
        # cells all match is a stray copy of the header line.
        header_row_mask = (df == df.columns).all(axis=1)

        if header_row_mask.any():
            print(f"Cleaning file: {file_path}")
            df_cleaned = df[~header_row_mask]  # Keep only the genuine data rows

            # Overwrite the original file with the cleaned data
            df_cleaned.to_csv(file_path, index=False)
            print(f"File {file_path} cleaned and overwritten.")
        else:
            print(f"No issues found in {file_path}.")

    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller continue
        print(f"Error processing {file_path}: {e}")

# Ticker symbols; each one maps to a CSV file named "<symbol>.csv"
STOCKS = [
    "ITC", "SBIN", "KOTAKBANK", "BHARTIARTL",
    "HCLTECH", "LT", "AXISBANK", "ASIANPAINT",
    "BAJFINANCE", "MARUTI", "M&M", "SUNPHARMA",
    "TITAN", "ULTRACEMCO", "NESTLEIND", "INDUSINDBK",
    "ADANIENT", "POWERGRID", "NTPC", "TATASTEEL",
    "JSWSTEEL", "ONGC", "BAJAJFINSV", "DIVISLAB",
    "TECHM", "WIPRO", "GRASIM", "BRITANNIA",
    "CIPLA", "ADANIGREEN", "ADANIPORTS", "HEROMOTOCO",
    "COALINDIA", "BPCL", "APOLLOHOSP", "TATAMOTORS",
    "HDFCLIFE", "DABUR", "BAJAJ-AUTO", "DRREDDY",
    "SBILIFE", "EICHERMOT", "HINDALCO", "ICICIGI",
]

# Clean every CSV that is present; report the ones that are missing
for stock in STOCKS:
    file_name = stock + ".csv"

    # Guard clause: nothing to clean when the file is absent
    if not os.path.exists(file_name):
        print(f"File {file_name} does not exist.")
        continue

    clean_data(file_name)


No issues found in ITC.csv.
No issues found in SBIN.csv.
No issues found in KOTAKBANK.csv.
No issues found in BHARTIARTL.csv.
No issues found in HCLTECH.csv.
No issues found in LT.csv.
No issues found in AXISBANK.csv.
No issues found in ASIANPAINT.csv.
No issues found in BAJFINANCE.csv.
No issues found in MARUTI.csv.
No issues found in M&M.csv.
No issues found in SUNPHARMA.csv.
No issues found in TITAN.csv.
No issues found in ULTRACEMCO.csv.
No issues found in NESTLEIND.csv.
No issues found in INDUSINDBK.csv.
No issues found in ADANIENT.csv.
No issues found in POWERGRID.csv.
No issues found in NTPC.csv.
No issues found in TATASTEEL.csv.
No issues found in JSWSTEEL.csv.
No issues found in ONGC.csv.
No issues found in BAJAJFINSV.csv.
No issues found in DIVISLAB.csv.
No issues found in TECHM.csv.
No issues found in WIPRO.csv.
No issues found in GRASIM.csv.
No issues found in BRITANNIA.csv.
No issues found in CIPLA.csv.
No issues found in ADANIGREEN.csv.
No issues found in ADANIPORTS.csv.
