<a href="https://colab.research.google.com/github/ABDELLAH-Hallou/Sentiment-Analysis/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [6]:
# Fetch the NLTK resources that the preprocessing helpers rely on.
nltk.download('stopwords') # Stop-word lists, read later through stopwords.words('english')
nltk.download('wordnet') # WordNet corpus; presumably what WordNetLemmatizer needs at runtime (verify)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def stop_words_removal(corpus, stop_words=None):
  """
  Remove stop words from every document in the given corpus.

  Each document is split on whitespace, tokens that exactly match a stop
  word (case-sensitive) are dropped, and the remaining tokens are joined
  back together with single spaces.

  Args:
  - corpus (list): List of strings representing text documents.
  - stop_words (iterable of str, optional): Words to remove. When omitted,
    the English stop-word list from the NLTK 'stopwords' corpus is used.

  Returns:
  - list: List of strings with stop words removed from each document.
  """
  if stop_words is None:
    stop_words = stopwords.words('english') # Default: NLTK's English stop words
  # A set gives constant-time membership tests; the original list was scanned
  # linearly once per token of every review.
  stop_words = set(stop_words)
  return [' '.join([word for word in review.split() if word not in stop_words]) for review in corpus]

def punctuation_removal(corpus):
  """
  Lowercase the corpus and delete every punctuation character from it.

  Args:
  - corpus (str): Text corpus.

  Returns:
  - str: Lowercased corpus with all characters listed in string.punctuation removed.
  """
  # Translation table whose third argument lists the characters to delete
  strip_table = str.maketrans('', '', punctuation)
  lowered = corpus.lower() # Lowercase first so the output is case-uniform
  return lowered.translate(strip_table)

def lemmatization(corpus):
  """
  Lemmatize each whitespace-separated token of every document.

  Args:
  - corpus (list): List of strings representing text documents.

  Returns:
  - list: One string per input document, made of the lemmatized tokens
    joined by single spaces.
  """
  # One lemmatizer instance is shared by all documents; only the word is
  # passed, so the lemmatizer's default arguments apply.
  lemmatize = WordNetLemmatizer().lemmatize
  lemmatized_docs = []
  for document in corpus:
    lemmas = map(lemmatize, document.split())
    lemmatized_docs.append(' '.join(lemmas))
  return lemmatized_docs

def load_data(obervations, labels, encoding='utf-8'):
  """
  Load the raw review text and the raw label text from two files.

  Args:
  - obervations (str): File path to the text data file. (The parameter name
    keeps its historical spelling so existing keyword callers still work.)
  - labels (str): File path to the label data file.
  - encoding (str, optional): Text encoding used to decode both files.
    Defaults to 'utf-8' so the result does not depend on the platform locale.

  Returns:
  - tuple: Two strings, the full contents of the reviews file and the full
    contents of the labels file, in that order.
  """
  with open(obervations, 'r', encoding=encoding) as f:
    reviews_text = f.read() # Entire reviews file as a single string
  # A separate local name keeps the `labels` path parameter intact instead of
  # overwriting it with the file contents.
  with open(labels, 'r', encoding=encoding) as f:
    labels_text = f.read() # Entire labels file as a single string
  return reviews_text, labels_text

def _split_lines(text):
  """Split text into lines, ignoring the single newline that terminates the text."""
  # Without this, a newline-terminated file yields a phantom empty last entry.
  if text.endswith('\n'):
    text = text[:-1]
  return text.split('\n')

def pre_processing(corpus,target):
  """
  Perform preprocessing on the given corpus and target labels.

  The corpus is lowercased and stripped of punctuation, split into one
  review per line, cleaned of stop words, lemmatized, and finally encoded
  as a binary bag-of-words matrix. Labels are split one per line and
  encoded as 1 for 'positive' and 0 for anything else.

  Args:
  - corpus (str): Text corpus, one review per line.
  - target (str): Target labels, one label per line, aligned with the reviews.

  Returns:
  - tuple: Four elements containing preprocessed reviews (list of str),
    labels (list of str), encoded reviews (the matrix returned by
    CountVectorizer.fit_transform), and encoded labels (list of int).

  Raises:
  - ValueError: If the number of reviews differs from the number of labels.
  """
  reviews = _split_lines(punctuation_removal(corpus)) # Lowercase, strip punctuation, one review per line
  labels = _split_lines(target) # One label per line
  if len(reviews) != len(labels):
    # Misaligned inputs would pair reviews with the wrong labels (or fail
    # later inside scikit-learn with a less helpful message).
    raise ValueError(f"Found {len(reviews)} reviews but {len(labels)} labels; they must be aligned line by line.")
  # Surrounding whitespace is ignored when encoding so that e.g. 'positive '
  # is not silently treated as negative; the returned labels stay untouched.
  encoded_labels = [1 if label.strip() == 'positive' else 0 for label in labels]
  reviews = lemmatization(stop_words_removal(reviews)) # Applying stop words removal and lemmatization to the reviews
  # NOTE: the vectorizer is fitted on the whole corpus and is not returned,
  # so its vocabulary cannot be reused to encode new text after this call.
  vectorizer = CountVectorizer(binary=True) # Binary presence/absence features
  encoded_reviews = vectorizer.fit_transform(reviews) # Transforming the pre-processed reviews into encoded features
  return reviews,labels,encoded_reviews,encoded_labels

In [7]:
# Read the raw review/label files and feed them straight into the
# preprocessing pipeline; the two raw strings are unpacked as its arguments.
reviews, labels, encoded_reviews, encoded_labels = pre_processing(
    *load_data('/content/reviews.txt', '/content/labels.txt')
)

In [8]:
# Kernels to compare; one grid search is run per kernel.
svm_types = [
    'linear',
    'poly',
    'rbf',
]

# Candidate hyperparameter values explored by the grid search.
param_grid = dict(
    C=[0.1, 0.25, 0.5, 0.75, 1, 10],
    gamma=[0.1, 1, 10],
)

In [None]:
# Split the encoded reviews and labels into training and testing sets
# with a test size of 30%; the fixed random state (80) makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(encoded_reviews, encoded_labels, test_size = 0.3, random_state=80)

best_score = 0 # Test accuracy of the selected model
best_cv_score = float('-inf') # Cross-validated accuracy of the selected model
best_svm = None
for svm_type in svm_types:
  # Create the SVM model for the current type
  svm = SVC(kernel=svm_type)

  # The linear kernel does not use gamma, so sweeping it only repeats
  # identical fits; drop it from the grid for that kernel.
  if svm_type == 'linear':
    grid = {name: values for name, values in param_grid.items() if name != 'gamma'}
  else:
    grid = param_grid

  # Perform grid search (5-fold CV on the training set) to find the best hyperparameters
  model = GridSearchCV(svm, grid, cv=5, n_jobs=2)
  model.fit(X_train, y_train)

  # Report accuracy on the held-out test set
  y_pred = model.predict(X_test)
  score = accuracy_score(y_test, y_pred)
  print(f"accuracy : {score}, svm type : {svm_type}")

  # Choose the winning kernel by its cross-validated score, not by test
  # accuracy: selecting on the test set would make best_score an optimistic
  # estimate of generalization performance.
  if model.best_score_ > best_cv_score:
    best_cv_score = model.best_score_
    best_score = score
    best_svm = model.best_estimator_




accuracy : 0.8706666666666667, svm type : linear
accuracy : 0.7493333333333333, svm type : poly
accuracy : 0.64, svm type : rbf
