In [None]:
# Import necessary modules
import pandas as pd

from pyspark.sql import SparkSession
from pymongo import MongoClient

from pyspark.sql.functions import col
from pyspark.sql.types import DateType

import os


In [None]:
# Pull settings from the local .env file into the process environment
from dotenv import load_dotenv

load_dotenv()

# MongoDB connection string, looked up under the "uri" variable (None when unset)
mongodb_uri = os.environ.get("uri")


In [None]:
# Reuse the existing SparkSession if there is one, otherwise create it
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Open a client for the MongoDB server named by the configured URI
# (local alternative: MongoClient("mongodb://localhost:27017/"))
client = MongoClient(host=mongodb_uri)

In [None]:
# Handles for the raw-tweet database and its collection
db = client.get_database("Big_Tweet")
collection = db.get_collection("Tweets")

# Materialise every document the cursor returns
data = [document for document in collection.find()]

# Tabulate the fetched documents as a pandas DataFrame
df = pd.DataFrame(data)

# Uncomment to inspect the frame
#print(df)

In [None]:
# Load the tweet time series from CSV into a Spark DataFrame: the first row
# supplies the column names and the column types are inferred from the data
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("Timeseries_tweets.csv")

# Alternative kept for reference: build the Spark DataFrame from a pandas one
#spark_df = spark.createDataFrame(df)

# Alternative kept for reference: read the CSV under a separate name
#spark_df = spark.read.csv("Timeseries_tweets.csv", header=True, inferSchema=True)


In [None]:
# Preview the first 20 rows, with long cell values truncated
df.show(n=20, truncate=True)

In [None]:
# Discard the columns that are not needed downstream
unwanted_columns = ['_c0']
df = df.drop(*unwanted_columns)

In [None]:
# Look at the data again now that '_c0' has been dropped
df.show(n=20, truncate=True)

In [None]:
# Print the DataFrame schema (column names and types) as a tree
df.printSchema()

In [None]:
# to change the date in a proper format importing necessary module
from pyspark.sql.functions import date_format

In [None]:
# Rewrite the 'date' column as a 'yyyy-MM-dd' formatted string
formatted_date = date_format(df['date'], 'yyyy-MM-dd')
df = df.withColumn('date', formatted_date)

In [None]:
# Preview the rows with the reformatted 'date' column
df.show(n=20, truncate=True)

In [None]:
# for the tweets cleaning importing all necessary module that help us to work with text data
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pyspark.sql.functions import col, regexp_replace

In [None]:
# Text cleaning. Every pattern below is handed to Spark's regexp_replace.

# Contractions removed by clean_text. They are kept as plain words (letters and
# apostrophes only) so that a single regex can be generated from them.
_CONTRACTIONS = (
    "ain't", "aren't", "can't", "can't've", "'cause", "couldn't", "could've",
    "couldn't've", "didn't", "doesn't", "don't", "hadn't", "hadn't've",
    "hasn't", "haven't", "he'd", "he'd've", "he'll", "he'll've", "he's",
    "how'd", "how'd'y", "how'll", "how's", "i'd", "i'd've", "i'll",
    "i'll've", "i'm", "i've", "isn't", "it'd", "it'd've", "it'll",
    "it'll've", "it's", "let's", "ma'am", "mayn't", "might've", "mightn't",
    "mightn't've", "mustn't", "mustn't've", "needn't", "needn't've",
    "o'clock", "oughtn't", "oughtn't've", "shan't", "sha'n't", "shan't've",
    "she'd", "she'd've", "she'll", "she'll've", "she's", "should've",
    "shouldn't", "shouldn't've", "so've", "so's", "that'd", "that'd've",
    "that's", "there'd", "there'd've", "there's", "they'd", "they'd've",
    "they'll", "they'll've", "they're", "they've", "to've", "wasn't",
    "we'd", "we'd've", "we'll", "we'll've", "we're", "we've", "weren't",
    "what'll",
)


def _contraction_pattern(words):
    """Return one regex that matches any of ``words`` as a whole word."""
    # Longest first: otherwise "can't" would win over "can't've" and leave
    # a stray "'ve" behind. The secondary key only makes the order stable.
    ordered = sorted(set(words), key=lambda word: (-len(word), word))
    # Accept the straight and the typographic apostrophe alike.
    alternatives = "|".join(word.replace("'", "['’]") for word in ordered)
    # (?i) ignores case ("I'm", "She's"); the lookarounds keep a short
    # contraction from matching inside a longer word ("he's" in "she's").
    return "(?i)(?<![A-Za-z])(?:" + alternatives + ")(?![A-Za-z])"


# Replacement steps, applied top to bottom; every match is replaced by ''.
_CLEANING_PATTERNS = (
    # Hyperlinks: stop at the next whitespace so the rest of the tweet survives
    r'https?://\S+',
    # E-mail addresses: before mentions, which would otherwise eat "@domain"
    r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9+._-]+\.[a-zA-Z0-9+._-]+)',
    # Mentions
    r'@[A-Za-z0-9_]+',
    # Hashtags
    r'#[A-Za-z0-9_]+',
    # Numbers: after mentions/hashtags so all-numeric tags are removed whole
    r'\d+',
    # Bracketed text, together with one optional leading space
    r" ?\([^)]+\)",
    # HTML tags: the whole <...> element, not just the angle brackets
    r'<[^>]+>',
    # Apostrophe words
    _contraction_pattern(_CONTRACTIONS),
    # Special characters, quotes, dots and any stray angle brackets
    r'''[_|!%\^&*~=$/.,?:;"'“”’<>]+''',
)


def clean_text(text):
    """Build a Spark column expression with tweet noise stripped from ``text``.

    The patterns in ``_CLEANING_PATTERNS`` are applied in order, each one
    deleting its matches: hyperlinks, e-mail addresses, mentions, hashtags,
    digit runs, bracketed text, HTML tags, the contractions listed in
    ``_CONTRACTIONS`` and finally punctuation and other special characters.

    Args:
        text: the column holding the raw tweet text; it is passed straight to
            ``regexp_replace`` as its first argument.

    Returns:
        The expression produced by chaining one ``regexp_replace`` call per
        pattern, starting from ``text``.
    """
    cleaned_text = text
    for pattern in _CLEANING_PATTERNS:
        cleaned_text = regexp_replace(cleaned_text, pattern, '')
    return cleaned_text

# Derive the 'cleaned_text' column by running the cleaner over 'text'
cleaned_column = clean_text(col('text'))
df = df.withColumn('cleaned_text', cleaned_column)

# Preview the rows, now including the cleaned text
df.show(n=20, truncate=True)

### Storing the cleaned data in the MongoDB database

In [None]:
# saving clean data to a object
clean_text=df.select('Date','cleaned_text')

In [None]:
# converting data to pandas data frame for the forcasting purpuse
clean_df = clean_text.toPandas()
clean_df.to_csv("clean_timeseries_tweets.csv", index=False)

In [None]:
# One dictionary per row, keyed by column name
clean_dt = clean_df.to_dict(orient="records")

# Write the rows to the Cleaned_Tweets collection of the current database.
# NOTE(review): insert_many adds documents, so re-running this cell presumably
# stores the same rows again - confirm whether the collection should be
# emptied first.
cleaned_collection = db["Cleaned_Tweets"]
cleaned_collection.insert_many(clean_dt)

### Reading the cleaned data from the MongoDB database

In [None]:
# reading the clean tweet for analysis and forecasting the sentiment lable
#tweets=pd.read_csv("clean_timeseries_tweets.csv")

In [None]:
# Point at the collection that holds the cleaned tweets
db = client.get_database("Big_Tweet")
collection = db.get_collection("Cleaned_Tweets")

# Pull every stored document into memory
data = [document for document in collection.find()]

# Tabulate the documents as a pandas DataFrame
tweets = pd.DataFrame(data)

In [None]:
# Number of missing values in each column
tweets.isnull().sum()

In [None]:
# Remove, in place, every row that has at least one missing value
tweets.dropna(axis=0, how='any', inplace=True)

In [None]:
# Display the tweets DataFrame
tweets

In [None]:
# importing the text analyzer module called textblob
from textblob import *

def getSubjectivity(text):
    """Return the subjectivity score TextBlob assigns to ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

# Polarity scorer for a single piece of text
def getPolarity(text):
    """Return the polarity score TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Score every cleaned tweet and store the results in the 'Subjectivity' and
# 'Polarity' columns
for column_name, scorer in (('Subjectivity', getSubjectivity), ('Polarity', getPolarity)):
    tweets[column_name] = tweets['cleaned_text'].apply(scorer)

def getAnalysis(score):
    """Translate a polarity score into a sentiment name.

    Returns 'Negative' for scores below zero, 'Neutral' for exactly zero and
    'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

In [None]:
# Name the sentiment of each tweet from its polarity score
polarity_scores = tweets['Polarity']
tweets['Sentiment'] = polarity_scores.apply(getAnalysis)

In [None]:
# Display the tweets DataFrame
tweets

In [None]:
# Display the tweets DataFrame again (this cell repeats the one before it)
tweets

In [None]:
# Encode each sentiment name as an integer: Positive -> 1, Neutral -> 0, Negative -> -1
sentiment_to_label = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
tweets['Sentiment_label'] = tweets['Sentiment'].map(sentiment_to_label)

In [None]:
# Display the tweets DataFrame
tweets

### `Plotting the sentiment of people over time on Twitter`

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Count tweets per (date, sentiment) pair, then pivot the sentiments into
# columns; combinations that never occur are filled with 0
daily_counts = tweets.groupby(['Date', 'Sentiment']).size()
sentiment_counts = daily_counts.unstack(fill_value=0)

# Line colour used for each sentiment category
colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'blue'}

# One line trace per sentiment category
fig = go.Figure()
for sentiment in sentiment_counts.columns:
    trace = go.Scatter(
        x=sentiment_counts.index,
        y=sentiment_counts[sentiment],
        mode='lines',
        name=sentiment,
        line={'color': colors[sentiment]},
        marker={'symbol': 'circle', 'size': 8},
    )
    fig.add_trace(trace)

# Titles and figure size in a single layout update
fig.update_layout(
    title='Sentiment Over Time',
    xaxis_title='Date',
    yaxis_title='Count',
    legend_title='Sentiment',
    width=1200,
    height=600,
)

# Render the figure
fig.show()


### `Plotting the sentiment of people over the week on Twitter`

In [None]:

# Convert the index to a datetime type
sentiment_counts.index = pd.to_datetime(sentiment_counts.index)

# On a date axis Plotly takes `dtick` as a number of milliseconds; the only
# string form it accepts there is 'M<n>' (months). A value such as '7D' is
# neither, so a one-week step has to be written out numerically.
ONE_WEEK_MS = 7 * 24 * 60 * 60 * 1000

# Total number of tweets per date, computed once for the percentage shown on hover
daily_totals = sentiment_counts.sum(axis=1)

# Create a line chart for each sentiment category
fig = go.Figure()
for sentiment in sentiment_counts.columns:
    # Share of this sentiment among all tweets of the same date, in percent
    share = ((sentiment_counts[sentiment] / daily_totals) * 100).round(2)
    fig.add_trace(go.Scatter(
        x=sentiment_counts.index,
        y=sentiment_counts[sentiment],
        mode='lines',
        name=sentiment,
        line=dict(color=colors[sentiment], width=2),
        # Every hover line carries a label, including the date
        hovertemplate='Date: %{x}<br>' +
                      'Sentiment: ' + sentiment + '<br>' +
                      'Count: %{y}<br>' +
                      'Percentage: %{text}%<extra></extra>',
        text=share
    ))

# Update layout
# NOTE(review): tick labels stay hidden as before, so the weekly step only
# shows in the tick/grid positions - confirm whether labels should be enabled.
fig.update_layout(
    title='Sentiment Over Weeks',
    xaxis=dict(
        tickmode='linear',
        tickangle=45,
        showticklabels=False,
        dtick=ONE_WEEK_MS,  # One tick per week (7 days)
        tickformat='%Y-%m-%d'  # Format the date as desired
    ),
    xaxis_title='',
    yaxis_title='Count',
    legend_title='Sentiment'
)

# Set figure size
fig.update_layout(width=1200, height=600)

# Show the plot
fig.show()


### `Plotting the sentiment of people over the month on Twitter`

In [None]:
# Convert the index to a datetime type
sentiment_counts.index = pd.to_datetime(sentiment_counts.index)

# Total number of tweets per date, computed once for the percentage shown on hover
daily_totals = sentiment_counts.sum(axis=1)

# Create a line chart for each sentiment category
fig = go.Figure()
for sentiment in sentiment_counts.columns:
    # Share of this sentiment among all tweets of the same date, in percent
    share = ((sentiment_counts[sentiment] / daily_totals) * 100).round(2)
    fig.add_trace(go.Scatter(
        x=sentiment_counts.index,
        y=sentiment_counts[sentiment],
        mode='lines',
        name=sentiment,
        line=dict(color=colors[sentiment], width=2),
        # Every hover line carries a label, including the date
        hovertemplate='Date: %{x}<br>' +
                      'Sentiment: ' + sentiment + '<br>' +
                      'Count: %{y}<br>' +
                      'Percentage: %{text}%<extra></extra>',
        text=share
    ))

# Update layout
fig.update_layout(
    title='Sentiment Over Months',
    xaxis=dict(
        tickmode='linear',
        tickangle=45,
        showticklabels=True,
        dtick='M1',  # Set tick frequency to one month
        tickformat='%Y-%m-%d'  # Format the date as desired
    ),
    xaxis_title='Month',
    yaxis_title='Count',
    legend_title='Sentiment'
)

# Set figure size
fig.update_layout(width=1200, height=600)

# Show the plot
fig.show()


In [None]:
### `Plotting the sentiment of people over the quarter on Twitter`

In [None]:
# Parse 'Date' and move it to the index so the rows can be bucketed by quarter.
# The guard makes the cell re-runnable: after the first run 'Date' is the
# index and no longer a column, so there is nothing left to convert.
if 'Date' in tweets.columns:
    tweets['Date'] = pd.to_datetime(tweets['Date'])
    tweets.set_index('Date', inplace=True)

# Count tweets per (quarter, sentiment) pair and pivot the sentiments into columns
sentiment_counts_quarterly = (
    tweets.groupby([pd.Grouper(freq='Q'), 'Sentiment']).size().unstack(fill_value=0)
)

# Every quarter between the first and the last one that has data
quarterly_dates = pd.date_range(start=sentiment_counts_quarterly.index.min(), end=sentiment_counts_quarterly.index.max(), freq='Q')

# Align the counts to that full range. Reindexing keeps each row attached to
# its own quarter and adds an all-zero row for a quarter without tweets;
# assigning the range as the index instead would fail with a length mismatch
# whenever such a quarter is missing from the grouped result.
sentiment_counts_quarterly = sentiment_counts_quarterly.reindex(quarterly_dates, fill_value=0)

# Total number of tweets per quarter, computed once for the hover percentage
quarterly_totals = sentiment_counts_quarterly.sum(axis=1)

# Now, create a line chart for each sentiment category
fig_quarterly = go.Figure()
for sentiment in sentiment_counts_quarterly.columns:
    # Share of this sentiment within the quarter, in percent; a quarter with
    # no tweets gives 0/0, which is reported as 0 rather than NaN
    share = ((sentiment_counts_quarterly[sentiment] / quarterly_totals) * 100).round(2).fillna(0)
    fig_quarterly.add_trace(go.Scatter(
        x=sentiment_counts_quarterly.index,
        y=sentiment_counts_quarterly[sentiment],
        mode='lines',
        name=sentiment,
        line=dict(color=colors[sentiment], width=2),
        hovertemplate='Quarter: %{x}<br>' +
                      'Sentiment: ' + sentiment + '<br>' +
                      'Count: %{y}<br>' +
                      'Percentage: %{text}%<extra></extra>',
        text=share
    ))

# Update layout: one labelled tick per quarter
fig_quarterly.update_layout(
    title='Sentiment Over Quarters',
    xaxis=dict(
        tickmode='array',
        tickvals=sentiment_counts_quarterly.index,
        ticktext=sentiment_counts_quarterly.index.strftime('%b %Y'),
        showticklabels=True
    ),
    xaxis_title='Quarter',
    yaxis_title='Count',
    legend_title='Sentiment'
)

# Set figure size
fig_quarterly.update_layout(width=1200, height=600)

# Show the plot
fig_quarterly.show()
