<a href="https://colab.research.google.com/github/ABDELLAH-Hallou/Sentiment-Analysis/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import numpy as np
from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [8]:
nltk.download('stopwords') # Download the stopwords corpus
nltk.download('wordnet') # Download the WordNet corpus

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
def stop_words_removal(corpus):
  """
  Remove stop words from the given corpus.

  Args:
  - corpus (list): List of strings representing text documents.

  Returns:
  - list: List of strings with stop words removed from each document.
  """
  stop_words = stopwords.words('english') # Initializing a list of stop words from the NLTK library
  return [' '.join([word for word in review.split() if word not in stop_words]) for review in corpus]

def punctuation_removal(corpus):
  """
  Remove punctuation from the given corpus.

  Args:
  - corpus (str): Text corpus.

  Returns:
  - str: Corpus with punctuation removed.
  """
  reviews = corpus.lower() # Converting the corpus to lowercase for uniformity
  return ''.join([char for char in reviews if char not in punctuation])

def lemmatization(corpus):
  """
  Perform lemmatization on the given corpus.

  Args:
  - corpus (list): List of strings representing text documents.

  Returns:
  - list: List of strings with lemmatized words for each document.
  """
  lemmatizer = WordNetLemmatizer() # Initializing the WordNet lemmatizer
  return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

def load_data(obervations, labels):
  """
  Load text data from the given file paths.

  Args:
  - observations (str): File path to the text data file.
  - labels (str): File path to the label data file.

  Returns:
  - tuple: Two elements containing the loaded text data and labels.
  """
  reviews_path = obervations # Path to the file containing the reviews
  labels_path = labels # Path to the file containing the labels
  with open(reviews_path, 'r') as f :
      reviews = f.read() # Reading the contents of the reviews file
  with open(labels_path, 'r') as f :
      labels = f.read() # Reading the contents of the labels file
  return reviews, labels

def pre_processing(corpus,target):
  """
  Perform preprocessing on the given corpus and target labels.

  Args:
  - corpus (str): Text corpus.
  - target (str): Target labels.

  Returns:
  - tuple: Four elements containing preprocessed reviews, labels, encoded reviews, and encoded labels.
  """
  reviews = punctuation_removal(corpus) # Removing punctuation from the corpus
  reviews = reviews.split('\n') # Splitting the reviews into a list based on newlines
  labels = target.split('\n') # Splitting the target labels into a list based on newlines
  encoded_labels = [1 if label == 'positive' else 0 for label in labels] # Encoding the labels as 1 for 'positive' and 0 for 'negative'
  reviews = lemmatization(stop_words_removal(reviews)) # Applying stop words removal and lemmatization to the reviews
  vectorizer = CountVectorizer(binary=True) # Initializing a binary CountVectorizer
  encoded_reviews = vectorizer.fit_transform(reviews) # Transforming the pre-processed reviews into encoded features
  return reviews,labels,encoded_reviews,encoded_labels

In [10]:
# Load data from file
reviews, labels = load_data('/content/reviews.txt', '/content/labels.txt')
# Perform pre-processing on the loaded data
reviews,labels,encoded_reviews,encoded_labels = pre_processing(reviews,labels)

In [11]:
param_grid = {
    'C': [0.25, 0.5, 1],
    'gamma': [0.001, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf']
}

In [27]:
# Split the reviews and labels into training and testing sets
# with a test size of 30% and a random state of 42
X_train, X_test, y_train, y_test = train_test_split(encoded_reviews, encoded_labels, test_size = 0.3, random_state=80)

In [13]:
# Perform grid search to find the best hyperparameters
model = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3,  n_jobs=2)
model.fit(X_train, y_train)
# print best parameter after tuning
print(model.best_params_)
# print how our model looks after hyper-parameter tuning
print(model.best_estimator_)
# Evaluate on the validation set
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"accuracy : {score}")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
{'C': 0.25, 'gamma': 0.001, 'kernel': 'linear'}
SVC(C=0.25, gamma=0.001, kernel='linear')
accuracy : 0.8653333333333333
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       757
           1       0.87      0.86      0.86       743

    accuracy                           0.87      1500
   macro avg       0.87      0.87      0.87      1500
weighted avg       0.87      0.87      0.87      1500



In [43]:
# Create a list of models
models = [
    ('NB', GaussianNB()),
    ('Logistic Regression', LogisticRegression(C=0.25)),
    ('Decision Tree', DecisionTreeClassifier()),
    ('KNN', KNeighborsClassifier(
        n_neighbors=3,
        metric = "euclidean"
    )),
    ('QDA', QuadraticDiscriminantAnalysis()),
    ('LDA', LinearDiscriminantAnalysis()),
    ("Random Forest", RandomForestClassifier()),
    ("Neural Net", MLPClassifier()),
    ("AdaBoost", AdaBoostClassifier())
]

In [45]:
for name, model in models:
    clf = model
    if name in ["LDA","NB","QDA"]:
      clf.fit(X_train.toarray(), y_train)
      y_pred = clf.predict(X_test.toarray())
      accuracy = clf.score(X_test.toarray(), y_test)
    else:
      clf.fit(X_train, y_train)
      y_pred = clf.predict(X_test)
      accuracy = clf.score(X_test, y_test)
    print("TEST Accuracy", name, accuracy)
    cm = classification_report(y_test, y_pred)
    print("Report for", name, ":\n", cm)

TEST Accuracy NB 0.7573333333333333
Report for NB :
               precision    recall  f1-score   support

           0       0.73      0.83      0.78       757
           1       0.80      0.68      0.74       743

    accuracy                           0.76      1500
   macro avg       0.76      0.76      0.76      1500
weighted avg       0.76      0.76      0.76      1500

TEST Accuracy Logistic Regression 0.88
Report for Logistic Regression :
               precision    recall  f1-score   support

           0       0.88      0.88      0.88       757
           1       0.88      0.88      0.88       743

    accuracy                           0.88      1500
   macro avg       0.88      0.88      0.88      1500
weighted avg       0.88      0.88      0.88      1500

TEST Accuracy Decision Tree 0.714
Report for Decision Tree :
               precision    recall  f1-score   support

           0       0.72      0.72      0.72       757
           1       0.71      0.71      0.71      



TEST Accuracy QDA 0.49666666666666665
Report for QDA :
               precision    recall  f1-score   support

           0       1.00      0.00      0.01       757
           1       0.50      1.00      0.66       743

    accuracy                           0.50      1500
   macro avg       0.75      0.50      0.33      1500
weighted avg       0.75      0.50      0.33      1500

TEST Accuracy LDA 0.6426666666666667
Report for LDA :
               precision    recall  f1-score   support

           0       0.66      0.59      0.63       757
           1       0.63      0.69      0.66       743

    accuracy                           0.64      1500
   macro avg       0.64      0.64      0.64      1500
weighted avg       0.64      0.64      0.64      1500

TEST Accuracy Random Forest 0.8453333333333334
Report for Random Forest :
               precision    recall  f1-score   support

           0       0.85      0.84      0.85       757
           1       0.84      0.85      0.85       7

Model|Accuracy|Precision|Recall|F1-Score
--- | --- | --- | --- | ---
Neural Net|0.905|0.91|0.90|0.90
Logistic Regression|0.880|0.88|0.88|0.88
Random Forest|0.845|0.85|0.84|0.85
SVM|0.865|0.87|0.87|0.87
AdaBoost|0.780|0.80|0.76|0.78
Decision Tree|0.714|0.72|0.72|0.72
Linear Discriminant|0.643|0.66|0.59|0.63
Naive Bayes|0.757|0.73|0.83|0.78
KNN|0.560|0.54|0.91|0.68
Quadratic Discriminant|0.496|1.00|0.00|0.01
