In [None]:
# Title: Sentiment Analysis of Social Media Comments
# Description: This notebook preprocesses data, trains various models for sentiment analysis, and evaluates their performance.

# Step 1: Import Necessary Libraries
import re
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Fetch the required NLTK resources (only needs to happen once per environment)
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)


In [None]:
# Step 2: Load and Explore Dataset
# Read the CSV, keep only the text and label columns, and discard incomplete rows
raw_comments = pd.read_csv('SaadLamjarred_LV.csv')
df = raw_comments.loc[:, ['Comments', 'Label']].dropna()

# Quick look at the size and the first rows of the cleaned frame
print(
    f"Dataset shape: {df.shape}",
    "First few rows of the dataset:",
    df.head(),
    sep="\n",
)


In [None]:
# Step 3: Data Preprocessing
# Preprocessing function to clean text data
@lru_cache(maxsize=None)
def _arabic_stop_words():
    """Return the NLTK Arabic stop-word list as a frozenset, loaded once and cached."""
    return frozenset(stopwords.words('arabic'))


@lru_cache(maxsize=None)
def _shared_stemmer():
    """Return a single PorterStemmer instance reused across all calls."""
    return PorterStemmer()


def preprocess_text(text):
    """
    Clean one comment for vectorization.

    Steps, in order: lowercase the text, strip every character that is neither
    a word character nor whitespace, split on whitespace, drop tokens found in
    the NLTK Arabic stop-word list, stem the remaining tokens, and join them
    back with single spaces.

    Parameters
    ----------
    text : str or scalar
        The raw comment. Missing values (None / NaN) are treated as an empty
        comment; any other non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The cleaned, space-separated tokens ('' when nothing survives).
    """
    if not isinstance(text, str):
        # Non-string cells (e.g. numeric comments) would crash on .lower()
        text = '' if pd.isna(text) else str(text)

    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove special characters and lowercase

    # Both helpers are cached, so the corpus file is not re-read for every row
    stop_words = _arabic_stop_words()
    # NOTE(review): PorterStemmer is designed for English; consider an Arabic
    # stemmer (e.g. nltk.stem.isri.ISRIStemmer) if Arabic stemming is intended - confirm.
    stemmer = _shared_stemmer()

    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Run every raw comment through the cleaning function and keep the result in a new column
df['processed_comments'] = df['Comments'].map(preprocess_text)
print("Sample processed comments:", df['processed_comments'].head(), sep="\n")

In [None]:
# Step 4: Extract Text and Labels
# The notebook previously used X, y and tfidf without ever defining them, so a
# fresh "Restart & Run All" failed with NameError. They are defined here.
texts = df['processed_comments']
y = df['Label']

# Step 5: Split Data into Training, Validation, and Test Sets
# 20% is held out for test; 25% of the remaining 80% becomes validation,
# giving a 60/20/20 train/validation/test split.
texts_train_val, texts_test, y_train_val, y_test = train_test_split(texts, y, test_size=0.2, random_state=42)
texts_train, texts_val, y_train, y_val = train_test_split(texts_train_val, y_train_val, test_size=0.25, random_state=42)

# Fit the vectorizer on the training text only, so vocabulary and IDF weights
# are not influenced by validation or test comments; then reuse it everywhere.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(texts_train)
X_val = tfidf.transform(texts_val)
X_test = tfidf.transform(texts_test)
X_train_val = tfidf.transform(texts_train_val)
X = tfidf.transform(texts)  # full feature matrix, kept for any code that expects X


In [None]:
# Step 6: Model Training and Evaluation
# Train and evaluate multiple models


def _fit_and_report(name, model, splits):
    """
    Fit `model` on the training split and print its accuracy on every split.

    Parameters
    ----------
    name : str
        Heading printed before the accuracy lines.
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``.
    splits : dict
        Maps a split label to an ``(X, y)`` pair. Must contain the key
        "Training", which is the split used for fitting. Accuracy lines are
        printed in the dict's insertion order.

    Returns
    -------
    The fitted `model`, so the caller can keep a reference to it.
    """
    X_fit, y_fit = splits["Training"]
    model.fit(X_fit, y_fit)
    print(name)
    for split_name, (X_split, y_split) in splits.items():
        print(f"{split_name} accuracy:", model.score(X_split, y_split))
    print()
    return model


sparse_splits = {
    "Training": (X_train, y_train),
    "Validation": (X_val, y_val),
    "Test": (X_test, y_test),
}

# The discriminant-analysis models are fed dense arrays; convert each split
# once and share the result instead of calling .toarray() for every fit/score.
dense_splits = {
    split_name: (X_split.toarray(), y_split)
    for split_name, (X_split, y_split) in sparse_splits.items()
}

# Logistic Regression
lr = _fit_and_report("Logistic Regression", LogisticRegression(random_state=42), sparse_splits)

# Naive Bayes
nb = _fit_and_report("Naive Bayes", MultinomialNB(), sparse_splits)

# Linear Discriminant Analysis (LDA)
lda = _fit_and_report("LDA", LinearDiscriminantAnalysis(), dense_splits)

# Quadratic Discriminant Analysis (QDA)
qda = _fit_and_report("QDA", QuadraticDiscriminantAnalysis(), dense_splits)

In [None]:

# Step 7: Logistic Regression with Grid Search for Hyperparameter Tuning
# Search over the regularization strength C (L2 penalty) with 5-fold cross-validation
param_grid = dict(C=[0.1, 1, 10], penalty=['l2'])
base_estimator = LogisticRegression(random_state=42)
grid_search = GridSearchCV(base_estimator, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report accuracy on each split, then the winning hyperparameters
print("Logistic Regression with Grid Search")
for split_label, features, targets in (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_label} accuracy:", grid_search.score(features, targets))
print("Best parameters:", grid_search.best_params_)
print()

In [None]:
# Step 8: Model Performance Visualization with Confusion Matrix (Logistic Regression)
y_pred = lr.predict(X_test)

# Fix the row/column order explicitly (sorted labels seen in truth or prediction)
# so the tick labels below are guaranteed to line up with the matrix cells.
class_order = sorted(set(y_test) | set(y_pred))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_order)

# Human-readable names for the known classes; any other label is shown as-is
sentiment_names = {-1: "Negative", 1: "Positive"}
tick_labels = [sentiment_names.get(label, str(label)) for label in class_order]

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix - Logistic Regression")
plt.show()

In [None]:
# Step 9: Prediction on New Comments
# Predicting sentiments for new comments
new_comments = ["فنان يتغنى في كل مرة بترات المغرب تحياتي"]

# The model was trained on the cleaned 'processed_comments' text, so new
# comments must go through the same cleaning before being vectorized.
new_comments_processed = [preprocess_text(comment) for comment in new_comments]
new_comments_tfidf = tfidf.transform(new_comments_processed)
predicted_sentiments = lr.predict(new_comments_tfidf)

# Mapping predictions to sentiment labels; a class outside the mapping is
# reported by its raw value instead of raising KeyError.
label_mapping = {-1: "Negative", 1: "Positive"}
predicted_labels = [label_mapping.get(sentiment, str(sentiment)) for sentiment in predicted_sentiments]

# Display results
for comment, label in zip(new_comments, predicted_labels):
    print(f"Comment: {comment}\nPredicted Sentiment: {label}\n")

In [None]:
# Step 10: Save Labeled Comments to CSV
# (Optional) Persist each new comment next to its predicted sentiment
output_path = 'labeled_comments.csv'
comments_df = pd.DataFrame(dict(Comments=new_comments, Sentiment=predicted_labels))
comments_df.to_csv(output_path, index=False)
print(f"New labeled comments saved to '{output_path}'")