## Import the Libraries

In [None]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import swifter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## Load the Data

In [None]:
# Read the raw CSV; it is loaded with header=None, so the column names are
# supplied explicitly at read time.
df = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='latin',
    header=None,
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
)

# Quick sanity check: schema/memory summary followed by the first rows.
df.info()
display(df.head())

## Text preprocessing

In [None]:
# Fetch the NLTK resources used by the preprocessing cell below.
#   stopwords -> stopwords.words('english')
#   punkt     -> nltk.word_tokenize on older NLTK releases
#   punkt_tab -> nltk.word_tokenize on recent NLTK releases, which load this
#                resource instead of 'punkt'
#   wordnet   -> WordNetLemmatizer
#   omw-1.4   -> required alongside WordNet by some NLTK releases
# Requesting a resource that the installed NLTK index does not know about only
# reports an error message; it does not raise, so the list is safe everywhere.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [None]:
# Patterns are compiled once at definition time instead of on every call.
# URLs: anything starting with "http", or a "www." prefix at a word boundary
# (the boundary keeps words such as "awwww" from being mistaken for a URL).
_URL_RE = re.compile(r"http\S+|\bwww\.\S+")
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
# re.escape is required: string.punctuation contains "\", "]", "^" and "-",
# which are special inside a character class. Unescaped, "\]" is read as an
# escaped bracket and backslashes are never matched.
_PUNCT_RE = re.compile('[{}]'.format(re.escape(string.punctuation)))
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')

# Lazily-filled cache for the NLTK objects so they are built once, not per tweet.
_NLTK_CACHE = {}


def _nltk_resources():
    """Return ``(stop_words, lemmatizer)``, building them on first use."""
    if 'lemmatizer' not in _NLTK_CACHE:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _NLTK_CACHE['stop_words'] = stop_words
        _NLTK_CACHE['lemmatizer'] = lemmatizer
    return _NLTK_CACHE['stop_words'], _NLTK_CACHE['lemmatizer']


def text_preprocess(text):
    """Normalise one tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; drop URLs, @mentions and #hashtags; replace
    punctuation with spaces; drop digits; collapse whitespace; tokenize with
    NLTK; remove English stop words; lemmatize with WordNet.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. NaN from a missing cell)
        yield an empty string instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (possibly empty).
    """
    # Missing values arrive as float NaN when applied over a pandas column.
    if not isinstance(text, str):
        return ''

    # lowering the text
    text = text.lower()

    # removing URLs, mentions and hashtags
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)

    # punctuation becomes a space so "good,bad" does not fuse into one token
    text = _PUNCT_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)

    # remove extra white space
    text = _WHITESPACE_RE.sub(' ', text).strip()

    stop_words, lemmatizer = _nltk_resources()

    # tokenize, drop stop words, then lemmatize what is left
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(tokens)

In [None]:
# Keep only the label and the raw tweet text, as an independent copy so later
# column additions do not touch the original frame.
data = df.loc[:, ['target', 'text']].copy()

In [None]:
# Run text_preprocess over every raw tweet through swifter's apply accessor
# and store the result next to the original text.
cleaned_tweets = data['text'].swifter.apply(text_preprocess)
data['cleaned_text'] = cleaned_tweets

In [None]:
# Persist the frame, without the index column, so the cleaning step does not
# have to be repeated.
data.to_csv(
    path_or_buf='/kaggle/working/preprocessed_data.csv',
    index=False,
)

## EDA

In [None]:
# Sentiment label distribution: one bar per value of the 'target' column
class_ax = sns.countplot(x='target', data=data, palette='viridis')
class_ax.set_title('Sentiment Class Distribution')
class_ax.set_xlabel('Sentiment')
class_ax.set_ylabel('Count')
plt.show()

In [None]:
# Tweet length analysis: count whitespace-separated words in each raw tweet
data['tweet_lengh'] = [len(tweet.split()) for tweet in data['text']]

# Stacked histogram of word counts, coloured by sentiment label
length_ax = sns.histplot(
    x='tweet_lengh',
    hue='target',
    data=data,
    bins=30,
    multiple='stack',
)
length_ax.set_title('Tweet Length by Sentiment')
length_ax.set_xlabel('Number of Words')
length_ax.set_ylabel('Tweet Count')
plt.show()

In [None]:
# Draw a word cloud built from the cleaned tweets of a single sentiment class
def plot_wordcloud(sentiment):
    """Show a word cloud for the rows of ``data`` whose target equals *sentiment*.

    Reads the notebook-level ``data`` frame, concatenates the matching
    ``cleaned_text`` values and renders them in a new 10x5 figure.
    """
    class_tweets = data.loc[data['target'] == sentiment, 'cleaned_text']
    corpus = ' '.join(class_tweets)

    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud.generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud - {sentiment} Sentiment')
    plt.show()

# One word cloud for each of the target values 0 and 4
for s in (0, 4):
    plot_wordcloud(sentiment=s)


In [None]:
def get_top_n_words(df, sentiment_label, n=10):
    """Return the *n* most frequent words among tweets of one sentiment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``target`` column and a ``cleaned_text`` column of
        space-separated tokens. This argument is what gets analysed; the
        function no longer silently reads the notebook-level ``data`` frame.
    sentiment_label
        Value of ``target`` selecting the rows to count.
    n : int, default 10
        Number of ``(word, count)`` pairs to return.

    Returns
    -------
    list[tuple[str, int]]
        Pairs ordered from most to least frequent.
    """
    subset = df.loc[df['target'] == sentiment_label, 'cleaned_text']

    # Count tweet by tweet rather than joining everything into one huge string
    # and splitting it again; the resulting counts and ordering are the same.
    word_frequency = Counter()
    for cleaned in subset:
        word_frequency.update(cleaned.split())

    return word_frequency.most_common(n)


# `data` (not the raw `df`) is the frame that carries the 'cleaned_text' column.
for sentiment in [0, 4]:
    print(f"\nTop words for {sentiment} sentiment:")
    top_words = get_top_n_words(data, sentiment)
    for word, count in top_words:
        print(f"{word}: {count}")


## Modeling

In [None]:
# Features are the cleaned tweet strings; labels are the sentiment targets.
X = data['cleaned_text']
y = data['target']

# Split BEFORE fitting the vectorizer. Fitting TF-IDF on the full corpus lets
# the held-out tweets influence the vocabulary and IDF weights, which leaks
# test information into training. The row partition is unchanged: same
# test_size and random_state as before.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Learn vocabulary/IDF from the training tweets only, then apply the fitted
# transform to the held-out tweets. NOTE: `X` now stays the raw text Series;
# the vectorized matrices are X_train and X_test.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)


In [None]:
# Fit a logistic-regression classifier on the training split
# (fit returns the estimator itself, so construction and fitting are chained).
lr = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Score the held-out split and print per-class precision/recall/F1.
y_pred = lr.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)