In [1]:
# Import necessary modules
import os
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import spacy
import textblob
from textblob import TextBlob

In [2]:
# Load the cleaned tweets from the CSV file into a DataFrame
tweets_data = pd.read_csv(filepath_or_buffer="Cleaned_Data.csv")

In [3]:
# Preview the first five rows of the loaded data
tweets_data.head(n=5)

Unnamed: 0,date,user,cleaned_text
0,2009-04-07 10:49:45,_TheSpecialOne_,
1,2009-04-07 10:49:49,scotthamilton,is upset that he update his Facebook by texti...
2,2009-04-07 10:49:53,mattycus,I dived many times for the ball Managed to sav...
3,2009-04-07 10:49:57,Karoli,no not behaving at all mad why am i here bec...
4,2009-04-07 10:49:57,ElleCTF,my whole body feels itchy and like its on fire


In [4]:
# Remove the "user" column, then preview what is left
tweets_data = tweets_data.drop(columns=["user"])
tweets_data.head(n=5)

Unnamed: 0,date,cleaned_text
0,2009-04-07 10:49:45,
1,2009-04-07 10:49:49,is upset that he update his Facebook by texti...
2,2009-04-07 10:49:53,I dived many times for the ball Managed to sav...
3,2009-04-07 10:49:57,no not behaving at all mad why am i here bec...
4,2009-04-07 10:49:57,my whole body feels itchy and like its on fire


In [5]:
# Inspect the dtype of each remaining column
tweets_data.dtypes

date            object
cleaned_text    object
dtype: object

In [6]:
# Count the tweets whose cleaned text is missing
missing_values = tweets_data['cleaned_text'].isna().sum()
print(f"Number of missing values: {missing_values}")

# Count the rows that exactly repeat an earlier row
duplicate_rows = tweets_data.duplicated(keep='first').sum()
print(f"Number of duplicate_rows: {duplicate_rows}")

Number of missing values: 22848
Number of duplicate_rows: 897


In [7]:
# Drop rows containing null values, then drop exact duplicate rows
tweets_data = tweets_data.dropna().drop_duplicates()

In [8]:
# Check the new shape of tweets_data, as (rows, columns), now that null and duplicate rows are gone
tweets_data.shape

(1574935, 2)

In [9]:
# Sentiment helpers built on TextBlob. TextBlob is imported explicitly in the
# imports cell at the top of the notebook; a wildcard `from textblob import *`
# would pull unused names into the notebook namespace.

# Create a function to get the subjectivity
def getSubjectivity(text):
    """Return the subjectivity score TextBlob assigns to ``text``.

    Parameters
    ----------
    text : str
        The tweet text to analyse.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.subjectivity`` (per the TextBlob docs,
        0.0 is fully objective and 1.0 is fully subjective).
    """
    return TextBlob(text).sentiment.subjectivity

# Create a function to get the polarity
def getPolarity(text):
    """Return the polarity score TextBlob assigns to ``text``.

    Parameters
    ----------
    text : str
        The tweet text to analyse.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity`` (per the TextBlob docs,
        -1.0 is most negative and 1.0 is most positive).
    """
    return TextBlob(text).sentiment.polarity

In [10]:
# Convert non-string entries to string so that TextBlob always receives str input
tweets_data['cleaned_text'] = tweets_data['cleaned_text'].astype(str)

# Score every tweet with getSubjectivity and store the result in a new 'Subjectivity' column
tweets_data['Subjectivity'] = tweets_data['cleaned_text'].apply(getSubjectivity)

In [11]:
# Score every tweet with getPolarity and store the result in a new 'Polarity' column
tweets_data['Polarity'] = tweets_data['cleaned_text'].apply(getPolarity)

In [12]:
def getAnalysis(row):
    """Classify one row as 'Positive', 'Negative' or 'Neutral'.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Subjectivity' and 'Polarity', such as a
        DataFrame row. 'Polarity' is only read when the subjectivity
        exceeds 0.5.

    Returns
    -------
    str
        'Neutral' when the subjectivity is not above 0.5 or the polarity is
        exactly 0; otherwise 'Positive' or 'Negative' by the polarity's sign.
    """
    # Rows that are not clearly subjective never receive a polarity label.
    if not row['Subjectivity'] > 0.5:
        return 'Neutral'

    polarity = row['Polarity']
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'
# Label every row by running getAnalysis across the DataFrame row by row
tweets_data['Sentiment'] = tweets_data.apply(getAnalysis, axis="columns")

In [13]:
# Translate each sentiment name into its numeric label
tweets_data['Sentiment_label'] = tweets_data['Sentiment'].map(
    {'Positive': 1, 'Neutral': 0, 'Negative': -1}
)

In [14]:
# Save the scored tweets; the DataFrame index is written out as the first CSV column
tweets_data.to_csv(path_or_buf="Tweets_with_sentiment.csv", index=True)

In [15]:
# Number of tweets in each sentiment class
tweets_data['Sentiment'].value_counts()

Sentiment
Neutral     973048
Positive    383584
Negative    218303
Name: count, dtype: int64

In [16]:
# Display the scored DataFrame (the rendered output is a truncated view of the frame)
tweets_data

Unnamed: 0,date,cleaned_text,Subjectivity,Polarity,Sentiment,Sentiment_label
1,2009-04-07 10:49:49,is upset that he update his Facebook by texti...,0.000000,0.000000,Neutral,0
2,2009-04-07 10:49:53,I dived many times for the ball Managed to sav...,0.500000,0.500000,Neutral,0
3,2009-04-07 10:49:57,no not behaving at all mad why am i here bec...,1.000000,-0.625000,Negative,-1
4,2009-04-07 10:49:57,my whole body feels itchy and like its on fire,0.400000,0.200000,Neutral,0
5,2009-04-07 10:50:00,not the whole crew,0.400000,0.200000,Neutral,0
...,...,...,...,...,...,...
1598122,2009-06-25 22:58:28,Tried to get the mutant Fawkes to follow me bu...,0.700000,-0.100000,Negative,-1
1598123,2009-06-25 22:58:28,Gmail is down,0.288889,-0.155556,Neutral,0
1598124,2009-06-25 22:58:30,Sounds like a rival is flagging your ads Not m...,0.200000,-0.100000,Neutral,0
1598125,2009-06-25 22:58:30,rest in peace Farrah So sad,1.000000,-0.500000,Negative,-1


In [None]:
from textblob import TextBlob
import pandas as pd

# Function to extract the most frequent noun as the topic
def extract_topic(text):
    """Return the most frequent noun in ``text``, or None if it has none.

    The text is POS-tagged via ``TextBlob(text).tags`` and only tokens whose
    tag is exactly ``'NN'`` are counted, so tokens carrying any other tag are
    ignored. Counting is case-sensitive.

    Parameters
    ----------
    text : str
        The tweet text to analyse.

    Returns
    -------
    str or None
        The ``'NN'``-tagged word with the highest count. When several words
        share the highest count, the one that appears first in the text is
        returned, so the result is the same on every run. ``None`` when no
        token is tagged ``'NN'``.
    """
    noun_counts = Counter(word for word, pos in TextBlob(text).tags if pos == 'NN')
    if not noun_counts:
        return None
    # most_common() lists equal counts in first-encountered order, unlike
    # max() over a set, whose iteration order for strings varies between runs.
    return noun_counts.most_common(1)[0][0]

# Apply the function to create a new 'topic' column
tweets_data['topic'] = tweets_data['cleaned_text'].apply(extract_topic)

# Preview the DataFrame with the new 'topic' column. Ending the cell with
# head() gives a rich table of the first rows instead of a plain-text print().
tweets_data.head()

In [19]:
# List the columns currently present in tweets_data
tweets_data.columns

Index(['date', 'cleaned_text', 'Subjectivity', 'Polarity', 'Sentiment',
       'Sentiment_label'],
      dtype='object')

In [None]:
# Keep only the columns needed from here on.
# 'topic' is created by the topic-extraction cell, so that cell must be run first.
tweets_data = tweets_data.loc[
    :, ['date', 'cleaned_text', 'Sentiment', 'Sentiment_label', 'topic']
]

In [None]:
# Preview the first five rows of the selected columns
tweets_data.head(n=5)

In [None]:
# Write the final DataFrame to a CSV file, leaving out the index column
csv_file_path = 'Tweets_with_sentiments.csv'
tweets_data.to_csv(path_or_buf=csv_file_path, index=False)

# Report where the file was written
print(f"DataFrame saved to CSV file: {csv_file_path}")