In [39]:
import pandas as pd
import re

# Load data
train_df = pd.read_csv('train.csv', names=['Sentiment', 'Text'], dtype={'Sentiment': str, 'Text': str}, low_memory=False)
train_df.dropna(inplace=True)

test_df = pd.read_csv('test.csv', names=['Sentiment', 'Text'], dtype={'Sentiment': str, 'Text': str})
test_df.dropna(inplace=True)

print('Train data size:', len(train_df))
print('Test data size:', len(test_df))

# Step 1: Exploratory Data Analysis
print("Data shape:", train_df.shape)
print("Sentiment distribution:\n", train_df['Sentiment'].value_counts())
print("Number of missing values:\n", train_df.isnull().sum())


Train data size: 1048576
Test data size: 360
Data shape: (1048576, 2)
Sentiment distribution:
 0            800000
1            248575
Sentiment         1
Name: Sentiment, dtype: int64
Number of missing values:
 Sentiment    0
Text         0
dtype: int64


In [40]:
# Step 2: Text Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove digits and special characters
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', '', text)
    return text

train_df['Text'] = train_df['Text'].apply(preprocess_text)
test_df['Text'] = test_df['Text'].apply(preprocess_text)

In [41]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Step 3: Linguistic Feature Extraction
# Bag-of-words features
vectorizer_bow = CountVectorizer()
bow_features = vectorizer_bow.fit_transform(train_df['Text'])

# TF-IDF features
vectorizer_tfidf = TfidfVectorizer()
tfidf_features = vectorizer_tfidf.fit_transform(train_df['Text'])

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
# Step 4: Build your sentiment classification model
# Split the data into training and testing sets
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(bow_features[:len(train_df)], train_df['Sentiment'], test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_features[:len(train_df)], train_df['Sentiment'], test_size=0.2, random_state=42)

# Logistic Regression model with Bag-of-words features
lr_bow = LogisticRegression(max_iter=100000)
lr_bow.fit(X_train_bow, y_train_bow)
y_pred_bow_lr = lr_bow.predict(X_test_bow)

# Logistic Regression model with TF-IDF features
lr_tfidf = LogisticRegression(max_iter=100000)
lr_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf_lr = lr_tfidf.predict(X_test_tfidf)

# Naive Bayes model with Bag-of-words features
nb_bow = MultinomialNB()
nb_bow.fit(X_train_bow, y_train_bow)
y_pred_bow_nb = nb_bow.predict(X_test_bow)

# Naive Bayes model with TF-IDF features
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf_nb = nb_tfidf.predict(X_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Step 5: Model evaluation
# Evaluate the models using accuracy, precision, recall, and F1 score
print("Logistic Regression with Bag-of-words features")
print("Accuracy:", accuracy_score(y_test_bow, y_pred_bow_lr))
print("Precision:", precision_score(y_test_bow, y_pred_bow_lr, average='weighted',zero_division=0))
print("Recall:", recall_score(y_test_bow, y_pred_bow_lr, average='weighted'))
print("F1 score:", f1_score(y_test_bow, y_pred_bow_lr, average='weighted'))

print("Logistic Regression with TF-IDF features")
print("Accuracy:", accuracy_score(y_test_tfidf, y_pred_tfidf_lr))
print("Precision:", precision_score(y_test_tfidf, y_pred_tfidf_lr, average='weighted',zero_division=0))
print("Recall:", recall_score(y_test_tfidf, y_pred_tfidf_lr, average='weighted'))
print("F1 score:", f1_score(y_test_tfidf, y_pred_tfidf_lr, average='weighted'))

Logistic Regression with Bag-of-words features
Accuracy: 0.8493772530469779
Precision: 0.8416011100852809
Recall: 0.8493772530469779
F1 score: 0.8407265186999815
Logistic Regression with TF-IDF features
Accuracy: 0.8529535180911327
Precision: 0.8458368485295767
Recall: 0.8529535180911327
F1 score: 0.8435115325701856
