In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

Load the dataset

In [None]:
# Load train.csv into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv(
    'train.csv',
)

Data exploration

In [None]:
# Preview the first rows of the frame.
print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
data.info()

Text preprocessing

In [None]:
# English stop words held in a set so the membership test in the filter below
# does not scan a list for every token.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Lower-case, tokenize, remove stop words from and lemmatize one document.

    Parameters
    ----------
    text : str or missing value
        Raw document. Missing values (None / NaN) are treated as an empty
        document; any other non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces (an empty
        string when nothing survives).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as NaN (a float); calling .lower() on them
        # would raise AttributeError and abort the whole .apply().
        text = '' if pd.isna(text) else str(text)
    tokens = word_tokenize(text.lower())
    return ' '.join(lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words)

# Overwrites the 'Text' column with its cleaned form; later cells read this
# column, so the name is kept as-is.
data['Text'] = data['Text'].apply(preprocess_text)

Exploratory Data Analysis

In [None]:
# Tally how many rows carry each sentiment label.
label_counts = data['Sentiment'].value_counts()
print(label_counts)

Visualize the distribution of sentiment labels

In [None]:
# Bar chart of label frequencies, drawn on an explicitly created Axes instead
# of relying on pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(10, 5))
data['Sentiment'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Distribution of Sentiment Labels')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

Word cloud visualization

In [None]:
# Corpus-wide token counts: split every document in 'Text' on whitespace,
# flatten to one token per row and tally. Converted to a plain dict, the
# mapping type generate_from_frequencies is documented to accept.
word_frequencies = data['Text'].str.split().explode().value_counts().to_dict()

# random_state pins the otherwise random word placement so that re-running the
# notebook reproduces the same figure.
wordcloud = WordCloud(
    width=800,
    height=500,
    background_color='white',
    max_words=200,
    colormap='tab20',
    random_state=42,
).generate_from_frequencies(word_frequencies)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the axes carry no meaning for a word cloud image
plt.show()

Text vectorization

In [None]:
# Fit a TF-IDF vectorizer on the 'Text' column and encode every document.
# NOTE(review): the fit sees all rows, including those that train_test_split
# later holds out for testing -- confirm this is acceptable for the evaluation.
corpus = data['Text']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

Label encoding

In [None]:
# Encode the sentiment labels as integers; the fitted encoder is kept so the
# integer codes can be mapped back to the original labels.
labels = data['Sentiment']
encoder = LabelEncoder()
y = encoder.fit_transform(labels)

Model selection

In [None]:
# Candidate classifiers keyed by the display name used when reporting results.
# The stochastic estimators are seeded (same seed as the train/test split) so
# that repeated runs of the notebook yield identical metrics.
models = {
    'Naive Bayes': MultinomialNB(),
    # probability=True adds a randomised internal calibration step, hence the seed.
    'SVM': SVC(probability=True, random_state=42),
    # Bootstrap sampling and feature sub-sampling are random without a seed.
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression()
}

Model evaluation

In [None]:
# The hold-out split does not depend on the model, so it is computed once
# instead of once per loop iteration. With the fixed random_state it is the
# same partition every model was previously evaluated on.
# NOTE(review): X comes from a TfidfVectorizer fitted on all rows before this
# split, so its IDF statistics include the test documents -- consider fitting
# the vectorizer on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit each candidate on the training portion and report hold-out metrics.
# Precision/recall/F1 are support-weighted averages over the classes.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{name} Performance:')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred, average="weighted")}')
    print(f'Recall: {recall_score(y_test, y_pred, average="weighted")}')
    print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted")}')
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report:')
    print(classification_report(y_test, y_pred))