#Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Mounting the Drive
**To read the file (Dataset)**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/My Drive/india-news-headlines.csv' #dataset file path

#**Data Collection**

##**News Data**

In [4]:
news_df = pd.read_csv(path)

In [5]:
news_df.head(5)

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


**Rename the Columns**

In [6]:
new_columns = ['Date', 'Category', 'News']
news_df.columns = new_columns

In [7]:
news_df.head(5)

Unnamed: 0,Date,Category,News
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


##Columns

In [8]:
news_df.columns

Index(['Date', 'Category', 'News'], dtype='object')

**Let the drop the unnecessary column**

In [9]:
news_df.drop('Category', axis=1, inplace=True)

#**Concatenate News by Date**

In [10]:
#Concatenate the news of the same dates. It means that rows with the same date will be considered as a single group.
news_df['News'] = news_df.groupby(['Date']).transform(lambda x : ' '.join(x))

##Duplicate Record

In [11]:
# Check for duplicated dates
print(f'The Number of Duplicate Dates Are: {news_df.duplicated().sum()}')

The Number of Duplicate Dates Are: 3868387


**Let drop the dupliacte records.**

In [12]:
# Drop duplicates, keeping the first occurrence
news_df.drop_duplicates(inplace=True)
news_df.reset_index(inplace=True,drop=True)

In [13]:
print(f'Now, The Number of Duplicate Dates Are: {news_df.duplicated().sum()}')

Now, The Number of Duplicate Dates Are: 0


In [14]:
news_df.head(3)

Unnamed: 0,Date,News
0,20010102,Status quo will not be disturbed at Ayodhya; s...
1,20010103,Powerless north India gropes in the dark Think...
2,20010104,The string that pulled Stephen Hawking to Indi...


**Data Shape**

In [15]:
news_df.shape

(8170, 2)

##**News Data Features Overview**

In [16]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8170 entries, 0 to 8169
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    8170 non-null   int64 
 1   News    8170 non-null   object
dtypes: int64(1), object(1)
memory usage: 127.8+ KB


**Since, the date is not in date-time format.**

**So, let change the date datatype**

In [17]:
from datetime import datetime

In [18]:
news_df['Date'] = pd.to_datetime(news_df['Date'], format='%Y%m%d')

In [19]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8170 entries, 0 to 8169
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    8170 non-null   datetime64[ns]
 1   News    8170 non-null   object        
dtypes: datetime64[ns](1), object(1)
memory usage: 127.8+ KB


In [20]:
news_df.head(3)

Unnamed: 0,Date,News
0,2001-01-02,Status quo will not be disturbed at Ayodhya; s...
1,2001-01-03,Powerless north India gropes in the dark Think...
2,2001-01-04,The string that pulled Stephen Hawking to Indi...


In [21]:
news_df.describe(datetime_is_numeric=True)

Unnamed: 0,Date
count,8170
mean,2012-04-20 05:56:54.932680704
min,2001-01-02 00:00:00
25%,2006-09-17 06:00:00
50%,2012-04-20 12:00:00
75%,2017-11-22 18:00:00
max,2023-06-30 00:00:00


In [22]:
news_df.describe(include='object')

Unnamed: 0,News
count,8170
unique,8170
top,Status quo will not be disturbed at Ayodhya; s...
freq,1


In [23]:
news_df.isnull().sum()

Date    0
News    0
dtype: int64

#**Historical Stock Data**

In [24]:
import yfinance as yf

Let have the stock data of the same years to the above news data.

**Date Selection**

In [25]:
start_date = news_df['Date'].min()
start_date

Timestamp('2001-01-02 00:00:00')

In [26]:
end_date = news_df['Date'].max()
end_date

Timestamp('2023-06-30 00:00:00')

In [27]:
symbol = "AAPL"
start_date = start_date.strftime('%Y-%m-%d')
end_date = end_date.strftime('%Y-%m-%d')


**Data Downloading**

In [28]:
# Fetch the data
stock_df = yf.download(symbol, start=start_date, end=end_date)

[*********************100%%**********************]  1 of 1 completed


**AAPL Stock Data**

In [29]:
stock_df.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-02,0.265625,0.272321,0.260045,0.265625,0.225165,452312000
2001-01-03,0.258929,0.297991,0.257813,0.292411,0.247871,817073600
2001-01-04,0.32394,0.330357,0.300223,0.304688,0.258278,739396000
2001-01-05,0.302455,0.310268,0.28683,0.292411,0.247871,412356000
2001-01-08,0.302455,0.303292,0.284598,0.295759,0.250709,373699200


In [30]:
stock_df.reset_index(inplace=True)

In [31]:
stock_df.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2001-01-02,0.265625,0.272321,0.260045,0.265625,0.225165,452312000
1,2001-01-03,0.258929,0.297991,0.257813,0.292411,0.247871,817073600
2,2001-01-04,0.32394,0.330357,0.300223,0.304688,0.258278,739396000


**Let check, is the data sorted w.r.t date**

In [32]:
print(f'In Stock Data, the Data is sorted w.r.t Date: {stock_df["Date"].dt.date.is_monotonic_increasing}')
print(f'In News Data, the Data is sorted w.r.t Date: {news_df["Date"].dt.date.is_monotonic_increasing}')

In Stock Data, the Data is sorted w.r.t Date: True
In News Data, the Data is sorted w.r.t Date: True


**Hence, the datasets are sorted w.r.t date.**

Sorting data by date (by monotonic_increasing order) is crucial because stock prices often exhibit temporal dependencies. The order of historical prices can impact the current price, and models require chronological data to capture these patterns effectively. This sorting also simplifies time-based splitting, feature engineering, and visualization, making our analysis and predictions more accurate and reliable.

##**Stock Data Features Overview**

In [33]:
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5658 entries, 0 to 5657
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       5658 non-null   datetime64[ns]
 1   Open       5658 non-null   float64       
 2   High       5658 non-null   float64       
 3   Low        5658 non-null   float64       
 4   Close      5658 non-null   float64       
 5   Adj Close  5658 non-null   float64       
 6   Volume     5658 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 309.5 KB


**Basic Statistics**

In [34]:
stock_df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,5658.0,5658.0,5658.0,5658.0,5658.0,5658.0
mean,34.07056,34.453706,33.702501,34.094835,32.710705,403902500.0
std,47.150836,47.717334,46.630048,47.202338,46.930783,378411100.0
min,0.231964,0.235536,0.227143,0.234286,0.198599,35195900.0
25%,2.647054,2.673928,2.617589,2.647054,2.243854,131032900.0
50%,15.410714,15.547857,15.251429,15.371071,13.262447,282275000.0
75%,39.961874,40.248125,39.626875,39.964375,37.706119,546112700.0
max,189.080002,190.070007,188.940002,189.589996,189.085205,3372970000.0


**Null Values**

In [35]:
stock_df.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

#**Merge the Stock and News Data**

In [36]:
df = pd.merge(stock_df, news_df, how='inner', on='Date')

In [37]:
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,News
0,2001-01-02,0.265625,0.272321,0.260045,0.265625,0.225165,452312000,Status quo will not be disturbed at Ayodhya; s...
1,2001-01-03,0.258929,0.297991,0.257813,0.292411,0.247871,817073600,Powerless north India gropes in the dark Think...
2,2001-01-04,0.32394,0.330357,0.300223,0.304688,0.258278,739396000,The string that pulled Stephen Hawking to Indi...
3,2001-01-05,0.302455,0.310268,0.28683,0.292411,0.247871,412356000,Light combat craft takes India into club class...
4,2001-01-08,0.302455,0.303292,0.284598,0.295759,0.250709,373699200,Sangh Parivar; Babri panel up the ante Frontru...


In [40]:
stock_df.shape, news_df.shape

((5658, 7), (8170, 2))

In [41]:
df.shape

(5627, 8)

In [42]:
df.duplicated().sum()

0

##**Data Preprocesssing/Analyis**

##**Data Cleaning**

In [43]:
df['News'].iloc[-11]

"Physiology Explained: The study of how the human body works Horoscope Today; June 14; 2023: Read your daily astrological predictions for Aquarius; Scorpio; Pisces and Others Bride calls off marriage after groom gets drunk during jaimala ceremony 5 summer wardrobe staples Tips to rain-proof your makeup this monsoon Optical illusion: Find 3 numbers hidden in this image; you only have 8 seconds! ap eamcet results 2023 to be released today at cets apsche ap gov in direct link here Kriti Sanon: It took a lot of effort to immerse myself into Janaki's persona High in calcium food options for 30+ women to improve bone health Top 10 Strategic Skills You Must Learn in 2023 Out&Proud@Work: 'Visible change; not Pride Month tokenism; the way ahead for Indian cos' Out&Proud@Work: Allyship creates a conducive work environment; say queer employees The battle will continue till we achieve basic human rights: Nishtha Nishant Out&Proud@Work: 'Active measures needed to create an inclusive workforce' sidh

In [44]:
import re

In [45]:
def clean_text(text):
    # Remove extra whitespaces and newline characters
    cleaned_text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase
    cleaned_text = cleaned_text.lower()

    # Remove punctuation and special characters
    cleaned_text = re.sub(r'[^a-z0-9\s]', '', cleaned_text)

    return cleaned_text

In [46]:
df['News'] = df['News'].apply(clean_text)

In [47]:
df['News'].iloc[-11]

'physiology explained the study of how the human body works horoscope today june 14 2023 read your daily astrological predictions for aquarius scorpio pisces and others bride calls off marriage after groom gets drunk during jaimala ceremony 5 summer wardrobe staples tips to rainproof your makeup this monsoon optical illusion find 3 numbers hidden in this image you only have 8 seconds ap eamcet results 2023 to be released today at cets apsche ap gov in direct link here kriti sanon it took a lot of effort to immerse myself into janakis persona high in calcium food options for 30 women to improve bone health top 10 strategic skills you must learn in 2023 outproudwork visible change not pride month tokenism the way ahead for indian cos outproudwork allyship creates a conducive work environment say queer employees the battle will continue till we achieve basic human rights nishtha nishant outproudwork active measures needed to create an inclusive workforce sidharth bhardwaj on his struggles

##**Sentiment Analysis**

In [48]:
from textblob import TextBlob

In [49]:
def subjectivity_score(text):
    return TextBlob(text).sentiment.subjectivity

In [51]:
df['Subjectivity'] = df['News'].apply(subjectivity_score)

In [52]:
def polarity_score(text):
    return TextBlob(text).sentiment.polarity

In [53]:
df['Polarity'] = df['News'].apply(polarity_score)

In [54]:
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,News,Subjectivity,Polarity
0,2001-01-02,0.265625,0.272321,0.260045,0.265625,0.225165,452312000,status quo will not be disturbed at ayodhya sa...,0.282333,0.151333
1,2001-01-03,0.258929,0.297991,0.257813,0.292411,0.247871,817073600,powerless north india gropes in the dark think...,0.407692,0.088462
2,2001-01-04,0.32394,0.330357,0.300223,0.304688,0.258278,739396000,the string that pulled stephen hawking to indi...,0.446847,0.090625
3,2001-01-05,0.302455,0.310268,0.28683,0.292411,0.247871,412356000,light combat craft takes india into club class...,0.476612,0.262024
4,2001-01-08,0.302455,0.303292,0.284598,0.295759,0.250709,373699200,sangh parivar babri panel up the ante frontrun...,0.414653,-0.019214


In [55]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

##**VADER (Valence Aware Dictionary and sEntiment Reasoner)**

In [56]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [57]:
SA = SentimentIntensityAnalyzer()

In [58]:
df['Compound'] = [SA.polarity_scores(news)['compound'] for news in df['News']]
df['Negative'] = [SA.polarity_scores(news)['neg'] for news in df['News']]
df['Neutral'] = [SA.polarity_scores(news)['neu'] for news in df['News']]
df['Positive'] = [SA.polarity_scores(news)['pos'] for news in df['News']]

##**Feature Selection**

**Use the Close Stock Price Column to Train Your Model.**

The closing price is a fundamental piece of information for stock analysis. It represents the final traded price for the day, which can be influenced by various factors and information available up to that point.

In [59]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'News',
       'Subjectivity', 'Polarity', 'Compound', 'Negative', 'Neutral',
       'Positive'],
      dtype='object')

In [60]:
selected_feat = df.columns.drop(['Date', 'Open', 'High', 'Low', 'Adj Close', 'Volume', 'News'])

In [61]:
final_df = df[selected_feat]
final_df.head(3)

Unnamed: 0,Close,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,0.265625,0.282333,0.151333,-0.9792,0.121,0.808,0.071
1,0.292411,0.407692,0.088462,-0.1779,0.121,0.767,0.112
2,0.304688,0.446847,0.090625,0.8047,0.1,0.798,0.102


In [62]:
X = final_df.drop('Close', axis=1)
y = final_df['Close']

# **Data Splitting**

In [63]:
from sklearn.model_selection import train_test_split

In [184]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [185]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4501, 6), (1126, 6), (4501,), (1126,))

#**Data Scaling**

In [188]:
from sklearn.preprocessing import StandardScaler

In [189]:
scaler_x = StandardScaler()

X_train_scaled = scaler_x.fit_transform(X_train)
X_test_scaled = scaler_x.transform(X_test)

In [190]:
scaler_y = StandardScaler()

y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))

#**Models**

##**RandomForestRegressor**

In [357]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error

In [399]:
rf = RandomForestRegressor(n_estimators=12, random_state=42, criterion='squared_error')

In [400]:
rf.fit(X_train_scaled, y_train_scaled.ravel())

prediction=rf.predict(X_test_scaled)

In [401]:
print(f'Root Mean Squared error (on Scaled Data): {np.sqrt(mean_squared_error(prediction,y_test_scaled)): .2f}')

Root Mean Squared error (on Scaled Data):  0.69


In [402]:
predictions_original_scale = scaler_y.inverse_transform(prediction.reshape(-1, 1))


mse = mean_squared_error(y_test, predictions_original_scale)

print(f"Root Mean Squared Error (on Original Scale): {np.sqrt(mse): .2f}")

Root Mean Squared Error (on Original Scale):  32.81


##**AdaBoostRegressor**

In [403]:
ABR = AdaBoostRegressor(n_estimators=5, random_state=42, loss='square', learning_rate=0.1)

In [404]:
ABR.fit(X_train_scaled, y_train_scaled.ravel())

ABR_prediction=ABR.predict(X_test_scaled)

In [405]:
print(f'Root Mean Squared error (on Scaled Data): {np.sqrt(mean_squared_error(ABR_prediction,y_test_scaled)): .2f}')

Root Mean Squared error (on Scaled Data):  0.71


In [406]:
predictions_original_scale = scaler_y.inverse_transform(ABR_prediction.reshape(-1, 1))


mse = mean_squared_error(y_test, predictions_original_scale)

print(f"Root Mean Squared Error (on Original Scale): {np.sqrt(mse): .2f}")

Root Mean Squared Error (on Original Scale):  33.80


##**DecisionTreeRegressor**

In [407]:
from sklearn.tree import DecisionTreeRegressor

In [411]:
DTR = DecisionTreeRegressor(criterion='squared_error')

In [412]:
DTR.fit(X_train_scaled, y_train_scaled.ravel())

DTR_prediction=DTR.predict(X_test_scaled)

In [413]:
print(f'Root Mean Squared error (on Scaled Data): {np.sqrt(mean_squared_error(DTR_prediction,y_test_scaled)): .2f}')

Root Mean Squared error (on Scaled Data):  0.85


In [414]:
predictions_original_scale = scaler_y.inverse_transform(DTR_prediction.reshape(-1, 1))


mse = mean_squared_error(y_test, predictions_original_scale)

print(f"Root Mean Squared Error (on Original Scale): {np.sqrt(mse): .2f}")

Root Mean Squared Error (on Original Scale):  40.49


**Hence, the RandomForestRegressor perform well among the models employed**