In [None]:
import joblib
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
df = pd.read_csv("/content/output2-3.csv")
df.head()

Unnamed: 0,sentence,sentiment,cleaned_tweet
0,دامك مع #غناتي ، فالك طيب 👍,positive,دمك غنت فلك طيب
1,على الفطرة السليمه.. الله يعطيه الصحة والعافية...,positive,فطر سلم الله يعط صحه عفي سكر طبل لحق باب قفل
2,📷 مشجع هلالي ينبذ العنصرية ب لافته أعدها.,positive,شجع هلل نبذ عنصر لفت اعد
3,سبحان الله🌸 الحمدلله 💮 لا اله الا الله 🌿 الله ...,positive,الله حمدلل اله الله الله كبر غفر الله وتب الله...
4,مشاركتي في مبادراتكم الجميلة فوز وسعادة 💞,positive,شرك بدر جمل فوز سعد


# **Pipeline**

In [None]:
import csv

def read_clean_tweets(filename):
    """
    Read a CSV file and extract a column called "clean_tweet" into a list.
    
    Args:
    filename - string representing the name of the CSV file
    
    Returns:
    clean_tweets - list of strings representing clean tweets
    """
    clean_tweets = []

    with open("/content/output2-3.csv", 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            clean_tweet = row['cleaned_tweet']
            clean_tweets.append(clean_tweet)
            
    return clean_tweets

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_features(data):
    """
    Extract TF-IDF features from preprocessed text data.
    
    Args:
    data - list of preprocessed text data
    
    Returns:
    feature_matrix - sparse matrix of TF-IDF features
    """
    # Initialize TfidfVectorizer object
    tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=5, ngram_range=(1, 2))

    # Fit and transform data to obtain feature matrix
    feature_matrix = tfidf_vectorizer.fit_transform(data)

    return feature_matrix

In [None]:
import csv

def read_sentiment_labels(filename):
    """
    Read a CSV file and extract a column called "sentiment" into a list.
    
    Args:
    filename - string representing the name of the CSV file
    
    Returns:
    sentiment_labels - list of strings representing sentiment labels
    """
    sentiment_labels = []

    with open(filename, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            sentiment_label = row['sentiment']
            sentiment_labels.append(sentiment_label)

    return sentiment_labels

In [None]:
xx = read_clean_tweets("/content/output2-3.csv")
X = extract_features(xx)
y = read_sentiment_labels("/content/output2-3.csv")
# print(z)

# **Naive Bayes**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes model on the training data
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Evaluate the performance of the Naive Bayes model on the test data
accuracy = nb.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.7692962301080759


In [None]:
from sklearn.metrics import classification_report

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid to search over
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}

# Create a grid search object
grid = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search object to the training data
grid.fit(X_train, y_train)

# Print the best hyperparameters and accuracy score
print("Best hyperparameters:", grid.best_params_)
print("Accuracy:", grid.best_score_)
y_pred = nb.predict(X_test)
report = classification_report(y_test, y_pred, target_names=['positive', 'negative'])
print(report)



Best hyperparameters: {'alpha': 0.1}
Accuracy: 0.7641325096696368
              precision    recall  f1-score   support

    positive       0.75      0.79      0.77      5823
    negative       0.79      0.74      0.76      5928

    accuracy                           0.77     11751
   macro avg       0.77      0.77      0.77     11751
weighted avg       0.77      0.77      0.77     11751



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes model on the training data with alpha=0.1
nb = MultinomialNB(alpha=0.1)
nb.fit(X_train, y_train)

# Evaluate the performance of the Naive Bayes model on the test data
accuracy = nb.score(X_test, y_test)
print("Accuracy:", accuracy)
joblib.dump(nb, '/content/drive/MyDrive/nb_model.sav')

Accuracy: 0.7724448983065271


['/content/drive/MyDrive/nb_model.sav']

# **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.metrics import precision_recall_fscore_support, classification_report

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [None, 5, 10, 20], 'min_samples_split': [2, 5, 10]}

# Create a Decision Tree model
dtc = DecisionTreeClassifier()

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(dtc, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train a Decision Tree model on the training data using the best hyperparameters
dtc = DecisionTreeClassifier(**best_params)
dtc.fit(X_train, y_train)

# Make predictions on the test data
y_pred = dtc.predict(X_test)

# Calculate precision, recall, and F1-score for each class
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

# Print the results
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Print classification report
report = classification_report(y_test, y_pred, target_names=['positive', 'negative'])
print(report)
print("Best hyperparameters:", grid.best_params_)



Precision: 0.7738435129110838
Recall: 0.773806484554506
F1-score: 0.7738109893253728
              precision    recall  f1-score   support

    positive       0.77      0.78      0.77      5823
    negative       0.78      0.77      0.77      5928

    accuracy                           0.77     11751
   macro avg       0.77      0.77      0.77     11751
weighted avg       0.77      0.77      0.77     11751

Best hyperparameters: {'alpha': 0.1}


In [None]:
print(best_params)

{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Decision Tree model with the specified parameters
dtc = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=5)

# Train the Decision Tree model on the training data
dtc.fit(X_train, y_train)

# Make predictions on the test data
y_pred = dtc.predict(X_test)

# Calculate precision, recall, and F1-score for each class
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Print classification report
report = classification_report(y_test, y_pred, target_names=['positive', 'negative'])
print(report)
joblib.dump(dtc, '/content/drive/MyDrive/dtc_model.sav')

Precision: 0.7690694324934731
Recall: 0.7690409326865799
F1-score: 0.7690453584448285
              precision    recall  f1-score   support

    positive       0.77      0.77      0.77      5823
    negative       0.77      0.77      0.77      5928

    accuracy                           0.77     11751
   macro avg       0.77      0.77      0.77     11751
weighted avg       0.77      0.77      0.77     11751



['/content/drive/MyDrive/dtc_model.sav']

# **SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM model on the training data
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

# Evaluate the performance of the SVM model on the test data
accuracy = svm.score(X_test, y_test)
print("Accuracy:", accuracy)
y_pred = svm.predict(X_test)

# Calculate precision, recall, and F1-score for each class
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

# Print the results
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
y_pred = svm.predict(X_test)

# Calculate precision, recall, and F1-score for each class
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')


report = classification_report(y_test, y_pred, target_names=['positive', 'negative'])
joblib.dump(svm, '/content/drive/MyDrive/svm_model.sav')

Accuracy: 0.7792528295464216
Precision: 0.7798518184812077
Recall: 0.7792528295464216
F1-score: 0.779193600964281


['/content/drive/MyDrive/svm_model.sav']