In [1]:
import pandas as pd
import re

In [2]:
df=pd.read_csv("bank_reviews3.csv")
print(df.shape)
print(df.head())

(1000, 10)
        author          date    address       bank  rating  \
0  AMRENDRA  T  Mar 21, 2020  New delhi        SBI     4.0   
1       BISHWA  Mar 20, 2020    Kolkata        SBI     5.0   
2      SANTOSH  Mar 20, 2020    Hooghly  Axis Bank     5.0   
3      MAHADEV  Mar 20, 2020       Pune  HDFC Bank     5.0   
4            R  Mar 20, 2020  Bangalore     review     5.0   

  review_title_by_user                                             review  \
0        "Best saving"  State Bank Of India is located nearby in our a...   
1       "Good service"  I have my salary account in SBI, when I applie...   
2  "Excellent Service"  I am using Axis bank saving account for the  p...   
3  "Excellent service"  I have my salary bank account in HDFC bank for...   
4       "Good account"  Close to around 10 years, I am holding this Co...   

                                          bank_image rating_title_by_user  \
0  https://static.bankbazaar.com/images/common/ba...               Great!   

In [3]:
print(df.isnull().sum())

author                  4
date                    0
address                 0
bank                    0
rating                  0
review_title_by_user    0
review                  0
bank_image              0
rating_title_by_user    0
useful_count            0
dtype: int64


In [4]:
#clean function for review column
def clean_text(text):
    text = str(text).lower()  # Lowercase and handle non-string safety
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [5]:
df['cleaned_review'] = df['review'].apply(clean_text)

In [6]:
print(df['cleaned_review'].head())

0    state bank of india is located nearby in our a...
1    i have my salary account in sbi when i applied...
2    i am using axis bank saving account for the pa...
3    i have my salary bank account in hdfc bank for...
4    close to around years i am holding this corpor...
Name: cleaned_review, dtype: object


In [7]:
print(df.duplicated().sum())

0


In [8]:
df['date'] = pd.to_datetime(df['date'], format='%b %d, %Y')
print(df['date'].head())

0   2020-03-21
1   2020-03-20
2   2020-03-20
3   2020-03-20
4   2020-03-20
Name: date, dtype: datetime64[ns]


In [9]:
df['month'] = df['date'].dt.to_period('M').astype(str)

In [10]:
keep_cols = ['date', 'month','bank', 'rating', 'cleaned_review']
df = df[keep_cols]

In [11]:
df.to_csv('banks_reviews_cleaned.csv', index=False)

##### Sentiment Analysis

In [12]:
import nltk #type:ignore
from nltk.sentiment.vader import SentimentIntensityAnalyzer #type:ignore

In [13]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\arpit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [14]:
sia= SentimentIntensityAnalyzer()

In [15]:
def get_sentiment(text):
    score = sia.polarity_scores(str(text))['compound']
    return score
df['sentiment_score'] = df['cleaned_review'].apply(get_sentiment)

In [16]:
def sentiment_category(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'
df['sentiment'] = df['sentiment_score'].apply(sentiment_category)

In [17]:
print(df.head())

        date    month       bank  rating  \
0 2020-03-21  2020-03        SBI     4.0   
1 2020-03-20  2020-03        SBI     5.0   
2 2020-03-20  2020-03  Axis Bank     5.0   
3 2020-03-20  2020-03  HDFC Bank     5.0   
4 2020-03-20  2020-03     review     5.0   

                                      cleaned_review  sentiment_score  \
0  state bank of india is located nearby in our a...           0.7264   
1  i have my salary account in sbi when i applied...           0.7165   
2  i am using axis bank saving account for the pa...           0.4588   
3  i have my salary bank account in hdfc bank for...           0.5267   
4  close to around years i am holding this corpor...           0.7357   

  sentiment  
0  Positive  
1  Positive  
2  Positive  
3  Positive  
4  Positive  


In [18]:
df.to_csv('banks_reviews_sentiment.csv', index=False)

In [19]:
monthly_sentiment=df.groupby('month')['sentiment_score'].mean().reset_index()
print(monthly_sentiment.head())

     month  sentiment_score
0  2019-11         0.248360
1  2019-12         0.333970
2  2020-01         0.414119
3  2020-02         0.330338
4  2020-03         0.336559


In [20]:
monthly_sentiment.to_csv('monthly_sentiment.csv', index=False)

##### Forecasting monthly sentiment score

In [22]:
from prophet import Prophet  #type:ignore

In [23]:
df = pd.read_csv('monthly_sentiment.csv')

In [24]:
df = df.rename(columns={'month': 'ds', 'sentiment_score': 'y'})
df['ds'] = pd.to_datetime(df['ds'])
model = Prophet()
model.fit(df)

future = model.make_future_dataframe(periods=6, freq='M') #for 6 months horizon
forecast=model.predict(future)
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())

01:02:16 - cmdstanpy - INFO - Chain [1] start processing
01:02:17 - cmdstanpy - INFO - Chain [1] done processing
01:02:17 - cmdstanpy - INFO - Chain [1] done processing


           ds      yhat  yhat_lower  yhat_upper
6  2020-04-30  0.401164    0.342027    0.458694
7  2020-05-31  0.418836    0.361801    0.477065
8  2020-06-30  0.435937    0.379091    0.499742
9  2020-07-31  0.453608    0.397451    0.514884
10 2020-08-31  0.471280    0.410376    0.528884


  dates = pd.date_range(


In [25]:
forecast.to_csv('monthly_sentiment_forecast.csv', index=False)