In [None]:
!pip install urlextract
!pip install emojis
!pip install hazm
!pip install imbalanced-learn



# Installing modules

In [None]:
from sklearn.linear_model import LogisticRegression
from __future__ import unicode_literals
from hazm import *
from pickle import TRUE
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
import re
from urlextract import URLExtract
import emojis
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from tqdm import tqdm
from gensim.models import Word2Vec
from hazm import stopwords_list
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from gensim.models import FastText
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import RandomOverSampler
from hazm import stopwords_list

# import data

## import train dataset

In [None]:
file_path = '/content/train_data.xlsx'
data = pd.read_excel(file_path, header=None)

data.columns = ['Text', 'Emotion']
print(data.head())

                                                Text Emotion
0  کی گفته مرد گریه نمیکنه!؟!؟ سیلم امشب سیل #اصفهان     SAD
1  عکسی که چند روز پیش گذاشته بودم این فیلم الانش...   OTHER
2  تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...     SAD
3           خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه   HAPPY
4  این خاک مال مردمان است نه حاکمان #ایران #مهسا_...   ANGRY


## import test dataset

In [None]:
file_path = '/content/3rdHW_test.csv'
test = pd.read_csv(file_path, header=None)

test.columns = ['Text']
print(test.head())

                                                Text
0               صعب روزی، بوالعجب کاری، پریشان عالمی
1         بسیار نرم و لطیف بوده و کیفیت بالایی داره.
2      اصلا رنگش با چیزی که تو عکس بود خیلی فرق داشت
3            خیلی زیبا و ب اندازه و با دقت طراحی شده
4  سبزی پلو با ماهی مال عید نوروزه، امشب سوشی میخ...


# Data cleaning


## This code is a set of functions designed for preprocessing Persian texts. It includes converting Persian numbers to English, converting Arabic characters to their Persian equivalents, removing URLs, replacing emojis and emoticons with appropriate tags, and applying other preprocessing filters.

### Detailed Explanation of Each Function and Its Operation:

1. **Function `_multiple_replace(mapping, text)`**:
    - This function takes a dictionary `mapping` of replacements and a text `text`.
    
2. **Function `convert_fa_numbers(input_str)`**:
    - This function converts Persian numbers to their English equivalents.

3. **Function `convert_ar_characters(input_str)`**:
    - This function converts Arabic characters to their Persian equivalents.

4. **Function `preprocess(text)`**:
    - This function preprocesses the input text and includes several stages:
      - **Removing URLs**: It uses `URLExtract` to identify all URLs in the text and replaces them with `<URL>`.
      - **Replacing Emojis**: Identifies emojis and replaces them with `<emoji>`.
      - **Converting Persian Numbers**: Calls `convert_fa_numbers` to convert Persian numbers.
      - **Converting Arabic Characters**: Calls `convert_ar_characters` to convert Arabic characters.
      - **Replacing Emoticons**: Uses a regex pattern to replace all emoticons in the text with `<smiley>`.
      - **Lowercasing**: Converts all characters to lowercase to avoid changes in URLs and emoticons.
      - **Cleaning Special Characters**: Replaces specific special characters like <>#.:()"\'!?؟،,@$%^&*_+\[\]/ with spaces.
      - **Removing Extra Spaces**: Converts multiple spaces to a single space.
      - **Removing Repeated Characters**: Reduces characters repeated more than twice to a single occurrence.
      - **Checking for Persian Characters**: If Persian characters are present in the text, it returns the processed text; otherwise, it returns 'None'.



## During the test of the model in different conditions, I realized that removing the numbers can have at least a small effect on the accuracy of the model, so we perform the data cleaning process once without the presence of numbers.

In [None]:
def _multiple_replace(mapping, text):
    pattern = "|".join(map(re.escape, mapping.keys()))
    return re.sub(pattern, lambda m: mapping[m.group()], str(text))

def convert_fa_numbers(input_str):
    mapping = {
        '۰': '0',
        '۱': '1',
        '۲': '2',
        '۳': '3',
        '۴': '4',
        '۵': '5',
        '۶': '6',
        '۷': '7',
        '۸': '8',
        '۹': '9',
        '.': '.',
    }
    return _multiple_replace(mapping, input_str)

def convert_ar_characters(input_str):
    """
    Converts Arabic chars to related Persian unicode char
    :param input_str: String contains Arabic chars
    :return: New str with converted arabic chars
    """
    mapping = {
        'ك': 'ک',
        'ى': 'ی',
        'ي': 'ی',
        'ئ':'ی',
        'إ':'ا',
        'أ':'ا',
        'ة':'ه',
        'ؤ':'و'
    }
    return _multiple_replace(mapping, input_str)

def preprocess(text):
    extractor = URLExtract()
    for url in extractor.gen_urls(text):
        text = text.replace(url, '<URL>')
    emj = emojis.get(text)
    for i in emj:
        if i in text:
            text = text.replace(i, '<emoji>')

    text_with_numbers = convert_fa_numbers(text)
    text_with_numbers = convert_ar_characters(text_with_numbers)

    text_without_numbers = re.sub(r'\d+', '', text_with_numbers)

    # regex to detect and replace all smilies in the text with <smiley>
    pattern_smiley = r"(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:\s?D|8-\)|:\s?\||;\s?\)|:-\*|:-\||:-\(|:\s?P|:-P|:-p|:-b|:-O|:-o|:-0|:-@|:\$|:-\^|:-&|:-\*|:-\+|:-~|:-`|:-\>|:-\<|:-\}|:-\{|\[:\s?\]|\[:\s?\]|:\s?\]|:\s?\[|:\s?\}|:\s?\{)"
    text_with_numbers = re.sub(pattern_smiley, '<smiley>', text_with_numbers)
    text_without_numbers = re.sub(pattern_smiley, '<smiley>', text_without_numbers)

    text_with_numbers = text_with_numbers.lower().strip()
    text_without_numbers = text_without_numbers.lower().strip()

    text_with_numbers = re.sub(r'[<>#.:()"\'!?؟،,@$%^&*_+\[\]/]', ' ', text_with_numbers)
    text_without_numbers = re.sub(r'[<>#.:()"\'!?؟،,@$%^&*_+\[\]/]', ' ', text_without_numbers)

    text_with_numbers = re.sub(r'[\s]{2,}', ' ', re.sub(r'(\w)\1{2,}', r'\1', text_with_numbers))
    text_without_numbers = re.sub(r'[\s]{2,}', ' ', re.sub(r'(\w)\1{2,}', r'\1', text_without_numbers))

    if re.search(r'[\u0600-\u06FF]', text_with_numbers):
        return text_with_numbers, text_without_numbers
    else:
        return 'None', 'None'

## The output of this function is text with and without numbers

In [None]:
tqdm.pandas()

In [None]:
data[['Cleaned_With_Numbers', 'Cleaned_Without_Numbers']] = data['Text'].progress_apply(preprocess).apply(pd.Series)


100%|██████████| 4924/4924 [02:11<00:00, 37.45it/s]


In [None]:
data.head(5)

Unnamed: 0,Text,Emotion,Cleaned_With_Numbers,Cleaned_Without_Numbers
0,کی گفته مرد گریه نمیکنه!؟!؟ سیلم امشب سیل #اصفهان,SAD,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان
1,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,OTHER,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...
2,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,SAD,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...
3,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,HAPPY,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه
4,این خاک مال مردمان است نه حاکمان #ایران #مهسا_...,ANGRY,این خاک مال مردمان است نه حاکمان ایران مهسا امینی,این خاک مال مردمان است نه حاکمان ایران مهسا امینی


##The code converts categorical emotion labels into numeric labels and adds these numeric labels as a new column in the DataFrame.

In [None]:
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['Emotion'])
data.head()


Unnamed: 0,Text,Emotion,Cleaned_With_Numbers,Cleaned_Without_Numbers,label_encoded
0,کی گفته مرد گریه نمیکنه!؟!؟ سیلم امشب سیل #اصفهان,SAD,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان,4
1,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,OTHER,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,3
2,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,SAD,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,4
3,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,HAPPY,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,2
4,این خاک مال مردمان است نه حاکمان #ایران #مهسا_...,ANGRY,این خاک مال مردمان است نه حاکمان ایران مهسا امینی,این خاک مال مردمان است نه حاکمان ایران مهسا امینی,0


In [None]:
# Display the unique labels and their corresponding encoded values
unique_labels = data[['Emotion', 'label_encoded']].drop_duplicates().sort_values(by='Emotion')
unique_labels


Unnamed: 0,Emotion,label_encoded
4,ANGRY,0
10,FEAR,1
3,HAPPY,2
1,OTHER,3
0,SAD,4


## Considering the frequency of each class, we realize that our classes are imbalanced

In [None]:
label_counts = data['Emotion'].value_counts(); label_counts

Emotion
HAPPY    1462
OTHER    1263
ANGRY    1012
SAD       845
FEAR      342
Name: count, dtype: int64

#It should be noted that in order to prevent data leakage, it is necessary to split the data and then perform normalization and vectorization on the train data and then transform it on the test data.

## In order to check whether the presence of numbers or their absence affects the accuracy of the model or not, we split the data from both features produced separately.

In [None]:
X_train_with_num,X_test_with_num, y_train, y_test = train_test_split(data['Cleaned_With_Numbers'], data['label_encoded'], test_size=0.2, random_state=42)

In [None]:
X_train_without_num, X_test_without_num, y_train, y_test = train_test_split(data['Cleaned_Without_Numbers'], data['label_encoded'], test_size=0.2, random_state=42)

##Normalization along with stopword removal and with numbers

In [None]:
# Custom transformer for normalization and stopword removal
class NormalizerAndStopwordRemover2(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.normalizer = hazm.Normalizer()
        self.stopwords = set(stopwords_list())

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(self._normalize_and_remove_stopwords)

    def _normalize_and_remove_stopwords(self, text):
        normalized_text = self.normalizer.normalize(text)
        tokens = hazm.word_tokenize(normalized_text)
        filtered_tokens = [token for token in tokens if token not in self.stopwords]
        return ' '.join(filtered_tokens)

# Initialize the transformer
transformer = NormalizerAndStopwordRemover2()

# Fit and transform the training data
X_train_normalized_without_stopword_number= transformer.fit_transform(X_train_with_num)

# Transform the test data
X_test_normalized_without_stopword_number= transformer.transform(X_test_with_num)

# Optional: Combine the normalized data back into the original DataFrame if needed
data['Cleaned_normalized_remove_stopword_num'] = data['Cleaned_With_Numbers']
data.loc[X_train_with_num.index, 'Cleaned_normalized_remove_stopword_num'] = X_train_normalized_without_stopword_number
data.loc[X_test_with_num.index, 'Cleaned_normalized_remove_stopword_num'] = X_test_normalized_without_stopword_number

print("\nUpdated DataFrame:")
data.head(5)


Updated DataFrame:


Unnamed: 0,Text,Emotion,Cleaned_With_Numbers,Cleaned_Without_Numbers,label_encoded,Cleaned_normalized_remove_stopword_num,Cleaned_normalized_remove_stopword_no_num,Cleaned_normalized_with_stopword
0,کی گفته مرد گریه نمیکنه!؟!؟ سیلم امشب سیل #اصفهان,SAD,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان,4,کی مرد گریه نمیکنه سیلم امشب سیل اصفهان,کی مرد گریه نمیکنه سیلم امشب سیل اصفهان,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان
1,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,OTHER,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,3,عکسی روز گذاشته_بودم فیلم الانشه وسط کوه‌ها لا...,عکسی روز گذاشته_بودم فیلم الانشه وسط کوه‌ها لا...,عکسی که چند روز پیش گذاشته_بودم این فیلم الانش...
2,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,SAD,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,4,تنهاییم شبیه تنهاییه ظهرای بچگیم وقتی‌که می‌خو...,تنهاییم شبیه تنهاییه ظهرای بچگیم وقتی‌که می‌خو...,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی‌که ه...
3,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,HAPPY,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,2,خوبه قسمت‌های گوشی محافظت می‌کنه,خوبه قسمت‌های گوشی محافظت می‌کنه,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه
4,این خاک مال مردمان است نه حاکمان #ایران #مهسا_...,ANGRY,این خاک مال مردمان است نه حاکمان ایران مهسا امینی,این خاک مال مردمان است نه حاکمان ایران مهسا امینی,0,خاک‌مال مردمان حاکمان ایران مهسا امینی,خاک‌مال مردمان حاکمان ایران مهسا امینی,این خاک‌مال مردمان است نه حاکمان ایران مهسا امینی


## Normalization along with stopword removal and without numbers

In [None]:
# Custom transformer for normalization and stopword removal
class NormalizerAndStopwordRemover(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.normalizer = hazm.Normalizer()
        self.stopwords = set(stopwords_list())

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(self._normalize_and_remove_stopwords)

    def _normalize_and_remove_stopwords(self, text):
        normalized_text = self.normalizer.normalize(text)
        tokens = hazm.word_tokenize(normalized_text)
        filtered_tokens = [token for token in tokens if token not in self.stopwords]
        return ' '.join(filtered_tokens)

# Initialize the transformer
transformer = NormalizerAndStopwordRemover()

# Fit and transform the training data
X_train_normalized_without_stopword_no_number= transformer.fit_transform(X_train_without_num)

# Transform the test data
X_test_normalized_without_stopword_no_number= transformer.transform(X_test_without_num)

# Optional: Combine the normalized data back into the original DataFrame if needed
data['Cleaned_normalized_remove_stopword_no_num'] = data['Cleaned_Without_Numbers']
data.loc[X_train_with_num.index, 'Cleaned_normalized_remove_stopword_no_num'] = X_train_normalized_without_stopword_no_number
data.loc[X_test_with_num.index, 'Cleaned_normalized_remove_stopword_no_num'] = X_test_normalized_without_stopword_no_number

print("\nUpdated DataFrame:")
data.head(5)


Updated DataFrame:


Unnamed: 0,Text,Emotion,Cleaned_With_Numbers,Cleaned_Without_Numbers,label_encoded,Cleaned_normalized_remove_stopword_no_num
0,کی گفته مرد گریه نمیکنه!؟!؟ سیلم امشب سیل #اصفهان,SAD,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان,4,کی مرد گریه نمیکنه سیلم امشب سیل اصفهان
1,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,OTHER,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,3,عکسی روز گذاشته_بودم فیلم الانشه وسط کوه‌ها لا...
2,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,SAD,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,4,تنهاییم شبیه تنهاییه ظهرای بچگیم وقتی‌که می‌خو...
3,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,HAPPY,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,2,خوبه قسمت‌های گوشی محافظت می‌کنه
4,این خاک مال مردمان است نه حاکمان #ایران #مهسا_...,ANGRY,این خاک مال مردمان است نه حاکمان ایران مهسا امینی,این خاک مال مردمان است نه حاکمان ایران مهسا امینی,0,خاک‌مال مردمان حاکمان ایران مهسا امینی


## Normalization with stopword and number

In [None]:
# Custom transformer for normalization without stopword removal
class NormalizerWithoutStopwordRemoval3(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.normalizer = hazm.Normalizer()
        self.stopwords = set(stopwords_list())

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(self._normalize_without_stopwords)

    def _normalize_without_stopwords(self, text):
        normalized_text = self.normalizer.normalize(text)
        tokens = hazm.word_tokenize(normalized_text)
        return ' '.join(tokens)

# Initialize the transformer
transformer = NormalizerWithoutStopwordRemoval3()

# Fit and transform the training data
X_train_normalized_with_stopword = transformer.fit_transform(X_train_with_num)

# Transform the test data
X_test_normalized_with_stopword = transformer.transform(X_test_with_num)

# Optional: Combine the normalized data back into the original DataFrame if needed
data['Cleaned_normalized_with_stopword'] = data['Cleaned_With_Numbers']
data.loc[X_train_with_num.index, 'Cleaned_normalized_with_stopword'] = X_train_normalized_with_stopword
data.loc[X_test_with_num.index, 'Cleaned_normalized_with_stopword'] = X_test_normalized_with_stopword

print("\nUpdated DataFrame:")
data.head(5)


Updated DataFrame:


Unnamed: 0,Text,Emotion,Cleaned_With_Numbers,Cleaned_Without_Numbers,label_encoded,Cleaned_normalized_remove_stopword_num,Cleaned_normalized_remove_stopword_no_num,Cleaned_normalized_with_stopword
0,کی گفته مرد گریه نمیکنه!؟!؟ سیلم امشب سیل #اصفهان,SAD,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان,4,کی مرد گریه نمیکنه سیلم امشب سیل اصفهان,کی مرد گریه نمیکنه سیلم امشب سیل اصفهان,کی گفته مرد گریه نمیکنه سیلم امشب سیل اصفهان
1,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,OTHER,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,عکسی که چند روز پیش گذاشته بودم این فیلم الانش...,3,عکسی روز گذاشته_بودم فیلم الانشه وسط کوه‌ها لا...,عکسی روز گذاشته_بودم فیلم الانشه وسط کوه‌ها لا...,عکسی که چند روز پیش گذاشته_بودم این فیلم الانش...
2,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,SAD,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی که ه...,4,تنهاییم شبیه تنهاییه ظهرای بچگیم وقتی‌که می‌خو...,تنهاییم شبیه تنهاییه ظهرای بچگیم وقتی‌که می‌خو...,تنهاییم شبیه تنهاییه ظهرای بچگیم شده وقتی‌که ه...
3,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,HAPPY,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه,2,خوبه قسمت‌های گوشی محافظت می‌کنه,خوبه قسمت‌های گوشی محافظت می‌کنه,خوبه تمام قسمت‌های گوشی رو محافظت می‌کنه
4,این خاک مال مردمان است نه حاکمان #ایران #مهسا_...,ANGRY,این خاک مال مردمان است نه حاکمان ایران مهسا امینی,این خاک مال مردمان است نه حاکمان ایران مهسا امینی,0,خاک‌مال مردمان حاکمان ایران مهسا امینی,خاک‌مال مردمان حاکمان ایران مهسا امینی,این خاک‌مال مردمان است نه حاکمان ایران مهسا امینی


### The input text has been cleaned with different modes, including without or with stopword and the presence and absence of numbers, but finally, special attention is paid to the mode where numbers and stopwords have been removed from the text.

#Vectorization methods

##1. CountVectorizer(BOW) on data that has been normalized and its stopword has been removed

In [None]:
count_vectorizer1 = CountVectorizer(max_df= 4, binary=False, ngram_range=(1,4), max_features= 1000)
X_train_count_vec_without_S = count_vectorizer1.fit_transform(X_train_normalized_without_stopword_no_number).todense()
X_test_count_vec_without_S = count_vectorizer1.transform(X_test_normalized_without_stopword_no_number).todense()
pd.DataFrame(X_train_count_vec_without_S , columns=count_vectorizer1.get_feature_names_out()).head(5)

Unnamed: 0,lrgcterrorists,lt,lt lt,lt lt lt,آب برق,آب قطع,آرامش,آزادی ایران,آسان,آسمان,...,گوشیه خوبیه,گوشیو,گول,گوگل,گیریم,یادت بره,یارو,یه آدم,یه روز,یه نفر
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##1. CountVectorizer(BOW) on data that has been just normalized

In [None]:
count_vectorizer2 = CountVectorizer(max_df= 4, binary=False, ngram_range=(1,4), max_features= 1000)
X_train_count_vect = count_vectorizer2.fit_transform(X_train_normalized_with_stopword).todense()
X_test_count_vect = count_vectorizer2.transform(X_test_normalized_with_stopword).todense()
pd.DataFrame(X_train_count_vect , columns=count_vectorizer2.get_feature_names_out()).head(5)

Unnamed: 0,lrgcterrorists,lt,lt lt,lt lt lt,آب برق,آرامش,آسان,آلودگی هوا,آمدند,آمدند اعتراضات,...,۱۹,۲۰۰,۲۲,۲۴,۲۵ دی,۳۰۰,۴۴,۵۰۰,۶۰,۸۰
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##2. TfidfVectorizer on data that has been normalized and its stopword has been removed

In [None]:
vectorizer1 = TfidfVectorizer(min_df=2, max_features= 1000, ngram_range=(1, 4))
X_train_tfidf_without_S = vectorizer1.fit_transform(X_train_normalized_without_stopword_number).todense()
X_test_tfidf_without_S = vectorizer1.transform(X_test_normalized_without_stopword_number).todense()
pd.DataFrame(X_train_tfidf_without_S, columns=vectorizer1.get_feature_names_out()).head(5)

Unnamed: 0,emoji,emoji emoji,emoji emoji emoji,emoji مهسا,emoji مهسا امینی,gt,gt gt,gt gt gt,gt gt gt gt,hero,...,۲۰۲۲,۲۰۲۳,۲۵,۲۶,۲۹,۲۹ ۳۰,۲۹ ۳۰ دی,۳۰,۳۰ دی,۵۷
0,0.293266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.701045,0.525784,0.350523,0.175261,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. TfidfVectorizer on data that has been normalized and its stopword has been removed AND WITHOUT number

In [None]:
vectorizer2 = TfidfVectorizer(min_df=2, max_features= 1000, ngram_range=(1, 4))
X_train_tfidf_without_S_and_N = vectorizer2.fit_transform(X_train_normalized_without_stopword_no_number).todense()
X_test_tfidf_without_S_and_N = vectorizer2.transform(X_test_normalized_without_stopword_no_number).todense()
pd.DataFrame(X_train_tfidf_without_S_and_N, columns=vectorizer2.get_feature_names_out()).head(5)

Unnamed: 0,emoji,emoji emoji,emoji emoji emoji,emoji مهسا,emoji مهسا امینی,gt,gt gt,gt gt gt,gt gt gt gt,hero,...,گل,گوشه,گوشی,گیر,گیرم,یاد,یخ,یه,یهو,یک
0,0.266317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.701045,0.525784,0.350523,0.175261,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##2. TfidfVectorizer on data that has been just normalized

In [None]:
vectorizer3 = TfidfVectorizer(min_df=2, max_features= 1000, ngram_range=(1, 4))
X_train_tfidf_with_S = vectorizer3.fit_transform(X_train_normalized_with_stopword).todense()
X_test_tfidf_with_S= vectorizer3.transform(X_test_normalized_with_stopword).todense()
pd.DataFrame(X_train_tfidf_with_S, columns=vectorizer3.get_feature_names_out()).head(5)

Unnamed: 0,emoji,emoji emoji,emoji مهسا,emoji مهسا امینی,gt,gt gt,gt gt gt,hero,irgcterorrists,irgcterrorist,...,۲۰۲۰,۲۰۲۱,۲۰۲۲,۲۰۲۳,۲۵,۲۹,۲۹ ۳۰,۲۹ ۳۰ دی,۳۰,۳۰ دی
0,0.328638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.696993,0.522744,0.348496,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##3. FastTextVectorizer

In [None]:
model1 = FastText(sentences=X_train_normalized_without_stopword_no_number, vector_size=200, window=6, min_count=2, workers=4, sg=1)
model2 = FastText(sentences=X_test_normalized_without_stopword_no_number, vector_size=200, window=6, min_count=2, workers=4, sg=1)

def sentence_vectorizer(sentence, model):
    words = [word for word in sentence if word in model.wv]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words], axis=0)

X_train_fasttext_vectorized = np.array([sentence_vectorizer(sentence, model1) for sentence in X_train_normalized_without_stopword_no_number])
X_test_fasttext_vectorized = np.array([sentence_vectorizer(sentence, model2) for sentence in X_test_normalized_without_stopword_no_number])

df_train_fasttext = pd.DataFrame(X_train_fasttext_vectorized)
df_test_fasttext = pd.DataFrame(X_test_fasttext_vectorized)



##4. Word2Vec vectorizer

In [None]:
model1 = Word2Vec(sentences=X_train_normalized_without_stopword_no_number, vector_size=200, window=6, min_count=2, workers=4, sg=1)
model2 = Word2Vec(sentences=X_test_normalized_without_stopword_no_number, vector_size=200, window=6, min_count=2, workers=4, sg=1)

def sentence_vectorizer(sentence, model):
    words = [word for word in sentence if word in model.wv]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words], axis=0)

X_train_word2vec_vect = np.array([sentence_vectorizer(sentence, model1) for sentence in X_train_normalized_without_stopword_no_number])
X_test_word2vec_vect = np.array([sentence_vectorizer(sentence, model2) for sentence in X_test_normalized_without_stopword_no_number])

df_train_word2vec = pd.DataFrame(X_train_word2vec_vect)
df_test_word2vec = pd.DataFrame(X_test_word2vec_vect)



#Because the distribution of data in the target classes was not the same, according to the existing methods, we generate data from classes with less data.

## 1. SMOTE Method





### Generating data using the SMOTE method based on normalized data and removing the stopword

In [None]:
smote = SMOTE(random_state=42)
X_resampled_TD_without_S, y_resampled_TD_without_S = smote.fit_resample(np.asarray(X_train_tfidf_without_S) , y_train)

### Generating data using the SMOTE method based on normalized data and removing the stopword AND numbers

In [None]:
X_resampled_TD_without_S_and_N, y_resampled_TD_without_S_and_N = smote.fit_resample(np.asarray(X_train_tfidf_without_S_and_N) , y_train)

### Generating data using the SMOTE method based on normalized data and removing the stopword AND numbers WITH wordtovec

In [None]:
X_resampled_word2vec_without_S_and_N, y_resampled_word2vec_without_S_and_N = smote.fit_resample(np.asarray(X_train_word2vec_vect) , y_train)

## 2. RandomOverSampler Method


### Generating data using the RandomOverSampler method based on normalized data and removing the stopword

In [None]:
ros = RandomOverSampler(random_state=42)
X_resampled_ros_TD_without_S_and_N, y_resampled_ros_TD_without_S_and_N = ros.fit_resample(np.asarray(X_train_tfidf_without_S_and_N), y_train)

In [None]:
X_resampled_ros_word2vec_without_S_and_N, y_resampled_ros_word2vec_without_S_and_N = ros.fit_resample(np.asarray(X_train_word2vec_vect), y_train)

# Now, after data CLEANING and data engineering, normalization and data generation for unbalanced classes, we implement classical machine learning models as sentiment analysis.

## RandomForestClassifier

## By applying the random forest model, according to the accuracy values, it can be concluded that the highest accuracy is when kfold and StratifiedKFold techniques are implemented, and in other cases, the accuracy was in the range of 57. Applying the word2vec method has also had the least effective performance.

### Characteristic of the model:

*   without STOPWORD
*   without NUMBER character
*   TfidfVectorizer
*   resampled with SMOTE method



In [None]:
# Initialize the model
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
# Train the model
rf_clf.fit(np.asarray(X_resampled_TD_without_S_and_N), y_resampled_TD_without_S_and_N)

# Predict the test set
y_pred_rf = rf_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix = confusion_matrix(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
class_report = classification_report(y_test, y_pred_rf)

print(f'Random Forest Accuracy: {accuracy_rf}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_rf}')
print(f'Recall: {recall_rf}')
print(f'F1 Score: {f1_rf}')
print(f'Classification Report:\n{class_report}')

Random Forest Accuracy: 0.5756345177664974
Confusion Matrix:
[[ 78   4  27  53  25]
 [  4  39   4  12   6]
 [ 22   8 221  61  13]
 [ 29  13  22 136  42]
 [ 20   5  12  36  93]]
Precision: 0.5887285182542942
Recall: 0.5756345177664974
F1 Score: 0.5788163983192438
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.42      0.46       187
           1       0.57      0.60      0.58        65
           2       0.77      0.68      0.72       325
           3       0.46      0.56      0.50       242
           4       0.52      0.56      0.54       166

    accuracy                           0.58       985
   macro avg       0.56      0.56      0.56       985
weighted avg       0.59      0.58      0.58       985



### Characteristic of the model:
*  without STOPWORD
*  without NUMBER character
*  TfidfVectorizer
*  resampled with SMOTE method
*  KFold

In [None]:
# Initialize the model
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)

# Assuming X_train and y_train are already defined as your training data
X_train = np.asarray(X_resampled_TD_without_S_and_N)
y_train = y_resampled_TD_without_S_and_N

# Perform cross-validation
y_pred_rf = cross_val_predict(rf_clf, X_train, y_train, cv=5)

# Evaluate the model
accuracy_rf = accuracy_score(y_train, y_pred_rf)
conf_matrix = confusion_matrix(y_train, y_pred_rf)
precision_rf = precision_score(y_train, y_pred_rf, average='weighted')
recall_rf = recall_score(y_train, y_pred_rf, average='weighted')
f1_rf = f1_score(y_train, y_pred_rf, average='weighted')
class_report = classification_report(y_train, y_pred_rf)

# Print the evaluation metrics
print(f'Random Forest Accuracy: {accuracy_rf}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_rf}')
print(f'Recall: {recall_rf}')
print(f'F1 Score: {f1_rf}')
print(f'Classification Report:\n{class_report}')


Random Forest Accuracy: 0.7102902374670185
Confusion Matrix:
[[ 675   42  121  195  104]
 [  10 1067    5   28   27]
 [  68   11  814  164   80]
 [ 113   71   91  636  226]
 [  56   24   40  171  846]]
Precision: 0.7124772273250439
Recall: 0.7102902374670185
F1 Score: 0.7090303903214129
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.59      0.66      1137
           1       0.88      0.94      0.91      1137
           2       0.76      0.72      0.74      1137
           3       0.53      0.56      0.55      1137
           4       0.66      0.74      0.70      1137

    accuracy                           0.71      5685
   macro avg       0.71      0.71      0.71      5685
weighted avg       0.71      0.71      0.71      5685



### Characteristic of the model:
*  without STOPWORD
*  without NUMBER character
*  TfidfVectorizer
*  resampled with SMOTE method
*  StratifiedKFold

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Initialize the model
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)

X_train = np.asarray(X_resampled_TD_without_S_and_N)
y_train = y_resampled_TD_without_S_and_N

# Set up Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Arrays to store scores
accuracy_scores = []
conf_matrices = []
precision_scores = []
recall_scores = []
f1_scores = []
class_reports = []

for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    # Train the model
    rf_clf.fit(X_train_fold, y_train_fold)

    # Predict the test set
    y_pred_rf = rf_clf.predict(X_test_fold)

    # Evaluate the model
    accuracy_rf = accuracy_score(y_test_fold, y_pred_rf)
    conf_matrix = confusion_matrix(y_test_fold, y_pred_rf)
    precision_rf = precision_score(y_test_fold, y_pred_rf, average='weighted')
    recall_rf = recall_score(y_test_fold, y_pred_rf, average='weighted')
    f1_rf = f1_score(y_test_fold, y_pred_rf, average='weighted')
    class_report = classification_report(y_test_fold, y_pred_rf, output_dict=True)

    # Store scores
    accuracy_scores.append(accuracy_rf)
    conf_matrices.append(conf_matrix)
    precision_scores.append(precision_rf)
    recall_scores.append(recall_rf)
    f1_scores.append(f1_rf)
    class_reports.append(class_report)

# Print the average of each metric across all folds
print(f'Average Random Forest Accuracy: {np.mean(accuracy_scores)}')
print(f'Average Confusion Matrix:\n{np.mean(conf_matrices, axis=0)}')
print(f'Average Precision: {np.mean(precision_scores)}')
print(f'Average Recall: {np.mean(recall_scores)}')
print(f'Average F1 Score: {np.mean(f1_scores)}')

# If you want to print the classification report for the last fold, you can do so:
print(f'Classification Report for the last fold:\n{classification_report(y_test_fold, y_pred_rf)}')


Average Random Forest Accuracy: 0.704485488126649
Average Confusion Matrix:
[[132.6   8.   22.2  42.   22.6]
 [  1.8 212.2   1.2   7.2   5. ]
 [ 14.    2.2 162.2  34.2  14.8]
 [ 23.4  13.4  16.4 128.8  45.4]
 [ 10.6   4.8   7.4  39.4 165.2]]
Average Precision: 0.7104977400092999
Average Recall: 0.704485488126649
Average F1 Score: 0.704438958802889
Classification Report for the last fold:
              precision    recall  f1-score   support

           0       0.77      0.59      0.67       227
           1       0.86      0.89      0.88       228
           2       0.78      0.71      0.74       228
           3       0.53      0.60      0.56       227
           4       0.66      0.76      0.71       227

    accuracy                           0.71      1137
   macro avg       0.72      0.71      0.71      1137
weighted avg       0.72      0.71      0.71      1137



### Characteristic of the model:
*  without STOPWORD
*  without NUMBER character
*  resampled with RandomOverSampler method
*  word2vec




In [None]:
# Initialize the model
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
# Train the model
rf_clf.fit(np.asarray(X_resampled_word2vec_without_S_and_N), y_resampled_word2vec_without_S_and_N)

# Predict the test set
y_pred_rf = rf_clf.predict(np.asarray(X_test_word2vec_vect))

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix = confusion_matrix(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
class_report = classification_report(y_test, y_pred_rf)

print(f'Random Forest Accuracy: {accuracy_rf}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_rf}')
print(f'Recall: {recall_rf}')
print(f'F1 Score: {f1_rf}')
print(f'Classification Report:\n{class_report}')

Random Forest Accuracy: 0.24771573604060915
Confusion Matrix:
[[  0   0   1 181   5]
 [  0   0   0  61   4]
 [  0   0   2 307  16]
 [  0   0   0 232  10]
 [  0   0   1 155  10]]
Precision: 0.2633216191591826
Recall: 0.24771573604060915
F1 Score: 0.11675822110644178
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       187
           1       0.00      0.00      0.00        65
           2       0.50      0.01      0.01       325
           3       0.25      0.96      0.39       242
           4       0.22      0.06      0.09       166

    accuracy                           0.25       985
   macro avg       0.19      0.21      0.10       985
weighted avg       0.26      0.25      0.12       985



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Characteristic of the model:
*  without STOPWORD
*  without NUMBER character
*  resampled with RandomOverSampler method
*  TF IDF




In [None]:
# Initialize the model
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
# Train the model
rf_clf.fit(np.asarray(X_resampled_ros_TD_without_S_and_N), y_resampled_ros_TD_without_S_and_N)

# Predict the test set
y_pred_rf = rf_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix = confusion_matrix(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
class_report = classification_report(y_test, y_pred_rf)

print(f'Random Forest Accuracy: {accuracy_rf}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_rf}')
print(f'Recall: {recall_rf}')
print(f'F1 Score: {f1_rf}')
print(f'Classification Report:\n{class_report}')

Random Forest Accuracy: 0.5644670050761421
Confusion Matrix:
[[ 80   6  25  53  23]
 [  3  35   3  17   7]
 [ 14   7 221  69  14]
 [ 30  16  21 128  47]
 [ 21   6  10  37  92]]
Precision: 0.5842103827194028
Recall: 0.5644670050761421
F1 Score: 0.5699888113456706
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.43      0.48       187
           1       0.50      0.54      0.52        65
           2       0.79      0.68      0.73       325
           3       0.42      0.53      0.47       242
           4       0.50      0.55      0.53       166

    accuracy                           0.56       985
   macro avg       0.55      0.55      0.54       985
weighted avg       0.58      0.56      0.57       985



### SVM

*  without STOPWORD
*  without NUMBER character
* resampled with SMOTH method
* TF IDF
* poly kernel

In [None]:
# Initialize the model
svm_clf = SVC(kernel='poly', random_state=42)
# Train the model
svm_clf.fit(np.asarray(X_resampled_TD_without_S_and_N), y_resampled_TD_without_S_and_N)

# Predict the test set
y_pred_svm = svm_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_svm= accuracy_score(y_test, y_pred_svm)
conf_matrix = confusion_matrix(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test,y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
class_report = classification_report(y_test, y_pred_svm)

print(f'svm Accuracy: {accuracy_svm}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_svm}')
print(f'Recall: {recall_svm}')
print(f'F1 Score: {f1_svm}')
print(f'Classification Report:\n{class_report}')


svm Accuracy: 0.5532994923857868
Confusion Matrix:
[[ 83   3  20  35  46]
 [  4  23   2  18  18]
 [ 17   2 209  44  53]
 [ 29  12   9 124  68]
 [ 14   4   6  36 106]]
Precision: 0.6019392820162709
Recall: 0.5532994923857868
F1 Score: 0.5640267604216717
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.44      0.50       187
           1       0.52      0.35      0.42        65
           2       0.85      0.64      0.73       325
           3       0.48      0.51      0.50       242
           4       0.36      0.64      0.46       166

    accuracy                           0.55       985
   macro avg       0.56      0.52      0.52       985
weighted avg       0.60      0.55      0.56       985



*  without STOPWORD
*  without NUMBER character
*  resampled with SMOTH method
*  TF IDF
*  Linear kernel
>>>>>>> The highest accuracy

In [None]:
# Initialize the model
svm_clf = SVC(kernel='linear', random_state=42)
# Train the model
svm_clf.fit(np.asarray(X_resampled_TD_without_S_and_N), y_resampled_TD_without_S_and_N)

# Predict the test set
y_pred_svm = svm_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_svm= accuracy_score(y_test, y_pred_svm)
conf_matrix = confusion_matrix(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test,y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
class_report = classification_report(y_test, y_pred_svm)

print(f'svm Accuracy: {accuracy_svm}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_svm}')
print(f'Recall: {recall_svm}')
print(f'F1 Score: {f1_svm}')
print(f'Classification Report:\n{class_report}')

svm Accuracy: 0.5888324873096447
Confusion Matrix:
[[ 96   8  19  36  28]
 [  7  37   2  11   8]
 [ 24  11 222  37  31]
 [ 29  23  13 131  46]
 [ 23  11   7  31  94]]
Precision: 0.6148212050631867
Recall: 0.5888324873096447
F1 Score: 0.5970889053964187
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.51      0.52       187
           1       0.41      0.57      0.48        65
           2       0.84      0.68      0.76       325
           3       0.53      0.54      0.54       242
           4       0.45      0.57      0.50       166

    accuracy                           0.59       985
   macro avg       0.56      0.57      0.56       985
weighted avg       0.61      0.59      0.60       985



*  without STOPWORD
*  without NUMBER character
*  resampled with RandomOverSampler method
*  TF IDF
*  linear kernel

In [None]:
# Initialize the model
svm_clf = SVC(kernel='linear', random_state=42)
# Train the model
svm_clf.fit(np.asarray(X_resampled_ros_TD_without_S_and_N), y_resampled_ros_TD_without_S_and_N)

# Predict the test set
y_pred_svm = svm_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_svm= accuracy_score(y_test, y_pred_svm)
conf_matrix = confusion_matrix(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test,y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
class_report = classification_report(y_test, y_pred_svm)

print(f'svm Accuracy: {accuracy_svm}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_svm}')
print(f'Recall: {recall_svm}')
print(f'F1 Score: {f1_svm}')
print(f'Classification Report:\n{class_report}')

svm Accuracy: 0.566497461928934
Confusion Matrix:
[[100   8  20  35  24]
 [  7  37   2  14   5]
 [ 32  10 217  44  22]
 [ 38  28  13 113  50]
 [ 26   8   8  33  91]]
Precision: 0.5917688237611483
Recall: 0.566497461928934
F1 Score: 0.5745560644661686
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.53      0.51       187
           1       0.41      0.57      0.47        65
           2       0.83      0.67      0.74       325
           3       0.47      0.47      0.47       242
           4       0.47      0.55      0.51       166

    accuracy                           0.57       985
   macro avg       0.54      0.56      0.54       985
weighted avg       0.59      0.57      0.57       985



### LogisticRegression


*  without STOPWORD
*  without NUMBER character
*  resampled with RandomOverSampler method
*  TF IDF


In [None]:
# Initialize the model
logreg_clf = LogisticRegression(random_state=42, max_iter=1000)
# Train the model
logreg_clf.fit(np.asarray(X_resampled_ros_TD_without_S_and_N), y_resampled_ros_TD_without_S_and_N)

# Predict the test set
y_pred_logreg = logreg_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_logreg= accuracy_score(y_test, y_pred_logreg)
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
precision_logreg = precision_score(y_test, y_pred_logreg, average='weighted')
recall_logreg = recall_score(y_test,y_pred_logreg, average='weighted')
f1_logreg = f1_score(y_test, y_pred_logreg, average='weighted')
class_report = classification_report(y_test, y_pred_logreg)

print(f'svm Accuracy: {accuracy_logreg}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_logreg}')
print(f'Recall: {recall_logreg}')
print(f'F1 Score: {f1_logreg}')
print(f'Classification Report:\n{class_report}')


svm Accuracy: 0.5776649746192893
Confusion Matrix:
[[ 97   9  20  35  26]
 [  8  38   2  10   7]
 [ 22   9 226  47  21]
 [ 38  27  10 117  50]
 [ 21  10  10  34  91]]
Precision: 0.6011502482493198
Recall: 0.5776649746192893
F1 Score: 0.585480030576791
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.52      0.52       187
           1       0.41      0.58      0.48        65
           2       0.84      0.70      0.76       325
           3       0.48      0.48      0.48       242
           4       0.47      0.55      0.50       166

    accuracy                           0.58       985
   macro avg       0.54      0.57      0.55       985
weighted avg       0.60      0.58      0.59       985



*  without STOPWORD
*  without NUMBER character
*  resampled with smoth method
*  TF IDF

In [None]:
# Initialize the model
logreg_clf = LogisticRegression(random_state=42, max_iter=1000)
# Train the model
logreg_clf.fit(np.asarray(X_resampled_TD_without_S_and_N), y_resampled_TD_without_S_and_N)

# Predict the test set
y_pred_logreg = logreg_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_logreg= accuracy_score(y_test, y_pred_logreg)
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
precision_logreg = precision_score(y_test, y_pred_logreg, average='weighted')
recall_logreg = recall_score(y_test,y_pred_logreg, average='weighted')
f1_logreg = f1_score(y_test, y_pred_logreg, average='weighted')
class_report = classification_report(y_test, y_pred_logreg)

print(f'svm Accuracy: {accuracy_logreg}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_logreg}')
print(f'Recall: {recall_logreg}')
print(f'F1 Score: {f1_logreg}')
print(f'Classification Report:\n{class_report}')


svm Accuracy: 0.5644670050761421
Confusion Matrix:
[[ 86  11  24  33  33]
 [  7  33   2  14   9]
 [ 23   8 222  38  34]
 [ 34  23  14 118  53]
 [ 20  10  10  29  97]]
Precision: 0.5882503975289286
Recall: 0.5644670050761421
F1 Score: 0.5716206025483137
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.46      0.48       187
           1       0.39      0.51      0.44        65
           2       0.82      0.68      0.74       325
           3       0.51      0.49      0.50       242
           4       0.43      0.58      0.49       166

    accuracy                           0.56       985
   macro avg       0.53      0.54      0.53       985
weighted avg       0.59      0.56      0.57       985



*  without STOPWORD
*  without NUMBER character
*  resampled with RandomOverSampler method
*  word2vec_


In [None]:
# Initialize the model
logreg_clf = LogisticRegression(random_state=42, max_iter=1000)
# Train the model
logreg_clf.fit(np.asarray(X_resampled_ros_word2vec_without_S_and_N), y_resampled_ros_word2vec_without_S_and_N)

# Predict the test set
y_pred_logreg = logreg_clf.predict(np.asarray(X_test_word2vec_vect))

# Evaluate the model
accuracy_logreg= accuracy_score(y_test, y_pred_logreg)
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
precision_logreg = precision_score(y_test, y_pred_logreg, average='weighted')
recall_logreg = recall_score(y_test,y_pred_logreg, average='weighted')
f1_logreg = f1_score(y_test, y_pred_logreg, average='weighted')
class_report = classification_report(y_test, y_pred_logreg)

print(f'svm Accuracy: {accuracy_logreg}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_logreg}')
print(f'Recall: {recall_logreg}')
print(f'F1 Score: {f1_logreg}')
print(f'Classification Report:\n{class_report}')

svm Accuracy: 0.16954314720812183
Confusion Matrix:
[[  0   0   0   0 187]
 [  0   0   0   0  65]
 [  0   0   0   0 325]
 [  0   0   0   1 241]
 [  0   0   0   0 166]]
Precision: 0.2741158020717263
Recall: 0.16954314720812183
F1 Score: 0.05067537848547682
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       187
           1       0.00      0.00      0.00        65
           2       0.00      0.00      0.00       325
           3       1.00      0.00      0.01       242
           4       0.17      1.00      0.29       166

    accuracy                           0.17       985
   macro avg       0.23      0.20      0.06       985
weighted avg       0.27      0.17      0.05       985



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### DecisionTreeClassifier

*  without STOPWORD
*  without NUMBER character
*  resampled with smoth method
*  word2vec


In [None]:
# Initialize the model
clf = DecisionTreeClassifier()
# Train the model
clf.fit(np.asarray(X_resampled_word2vec_without_S_and_N), y_resampled_word2vec_without_S_and_N)

# Predict the test set
y_pred = clf.predict(np.asarray(X_test_word2vec_vect))

# Evaluate the model
accuracy_DT = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision_DT = precision_score(y_test, y_pred, average='weighted')
recall_DT = recall_score(y_test, y_pred, average='weighted')
f1_DT = f1_score(y_test, y_pred, average='weighted')
class_report = classification_report(y_test, y_pred)

print(f'Decision Tree Accuracy: {accuracy_DT}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_DT}')
print(f'Recall: {recall_DT}')
print(f'F1 Score: {f1_DT}')
print(f'Classification Report:\n{class_report}')

Decision Tree Accuracy: 0.23756345177664975
Confusion Matrix:
[[  0   0   6 181   0]
 [  0   0   1  64   0]
 [  0   0  12 313   0]
 [  0   0  20 222   0]
 [  0   0   5 161   0]]
Precision: 0.14794803905750786
Recall: 0.23756345177664975
F1 Score: 0.11366997272274365
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       187
           1       0.00      0.00      0.00        65
           2       0.27      0.04      0.07       325
           3       0.24      0.92      0.38       242
           4       0.00      0.00      0.00       166

    accuracy                           0.24       985
   macro avg       0.10      0.19      0.09       985
weighted avg       0.15      0.24      0.11       985



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*  without STOPWORD
*  without NUMBER character
*  resampled with smoth method
*  tfidf

In [None]:
# Initialize the model
clf = DecisionTreeClassifier()
# Train the model
clf.fit(np.asarray(X_resampled_TD_without_S_and_N), y_resampled_TD_without_S_and_N)

# Predict the test set
y_pred = clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_DT = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision_DT = precision_score(y_test, y_pred, average='weighted')
recall_DT = recall_score(y_test, y_pred, average='weighted')
f1_DT = f1_score(y_test, y_pred, average='weighted')
class_report = classification_report(y_test, y_pred)

print(f'Decision Tree Accuracy: {accuracy_DT}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_DT}')
print(f'Recall: {recall_DT}')
print(f'F1 Score: {f1_DT}')
print(f'Classification Report:\n{class_report}')

Decision Tree Accuracy: 0.5126903553299492
Confusion Matrix:
[[ 72   7  28  46  34]
 [  2  37   5  16   5]
 [ 32   8 205  57  23]
 [ 37  19  25 121  40]
 [ 24   7  18  47  70]]
Precision: 0.5260322294942221
Recall: 0.5126903553299492
F1 Score: 0.5168051162172637
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.39      0.41       187
           1       0.47      0.57      0.52        65
           2       0.73      0.63      0.68       325
           3       0.42      0.50      0.46       242
           4       0.41      0.42      0.41       166

    accuracy                           0.51       985
   macro avg       0.49      0.50      0.49       985
weighted avg       0.53      0.51      0.52       985



*  without STOPWORD
*  without NUMBER character
*  resampled with ros method
*  tfidf

In [None]:
# Initialize the model
clf = DecisionTreeClassifier()
# Train the model
clf.fit(np.asarray(X_resampled_ros_TD_without_S_and_N), y_resampled_ros_TD_without_S_and_N)

# Predict the test set
y_pred = clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_DT = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision_DT = precision_score(y_test, y_pred, average='weighted')
recall_DT = recall_score(y_test, y_pred, average='weighted')
f1_DT = f1_score(y_test, y_pred, average='weighted')
class_report = classification_report(y_test, y_pred)

print(f'Decision Tree Accuracy: {accuracy_DT}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_DT}')
print(f'Recall: {recall_DT}')
print(f'F1 Score: {f1_DT}')
print(f'Classification Report:\n{class_report}')

Decision Tree Accuracy: 0.49137055837563454
Confusion Matrix:
[[ 70   7  30  50  30]
 [  8  30   1  17   9]
 [ 31  10 199  52  33]
 [ 45  19  24 108  46]
 [ 28   8  17  36  77]]
Precision: 0.5094951356446954
Recall: 0.49137055837563454
F1 Score: 0.49782698488064303
Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.37      0.38       187
           1       0.41      0.46      0.43        65
           2       0.73      0.61      0.67       325
           3       0.41      0.45      0.43       242
           4       0.39      0.46      0.43       166

    accuracy                           0.49       985
   macro avg       0.47      0.47      0.47       985
weighted avg       0.51      0.49      0.50       985



### GradientBoostingClassifier

*  without STOPWORD
*  without NUMBER character
*  resampled with ros method
*  tfidf

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Initialize the model
gb_clf = GradientBoostingClassifier(n_estimators=150, random_state=42)
# Train the model
gb_clf.fit(np.asarray(X_resampled_ros_TD_without_S_and_N), y_resampled_ros_TD_without_S_and_N)

# Predict the test set
y_pred_gb = gb_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_gb= accuracy_score(y_test, y_pred_gb)
conf_matrix = confusion_matrix(y_test, y_pred_gb)
precision_gb = precision_score(y_test, y_pred_gb, average='weighted')
recall_gb = recall_score(y_test,y_pred_gb, average='weighted')
f1_gb = f1_score(y_test, y_pred_gb, average='weighted')
class_report = classification_report(y_test, y_pred_gb)

print(f'svm Accuracy: {accuracy_gb}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_gb}')
print(f'Recall: {recall_gb}')
print(f'F1 Score: {f1_gb}')
print(f'Classification Report:\n{class_report}')

svm Accuracy: 0.550253807106599
Confusion Matrix:
[[ 79   9  22  53  24]
 [  3  36   2  16   8]
 [ 19   7 203  79  17]
 [ 28  25  11 133  45]
 [ 15   9   2  49  91]]
Precision: 0.5927744170802842
Recall: 0.550253807106599
F1 Score: 0.5608208893238406
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.42      0.48       187
           1       0.42      0.55      0.48        65
           2       0.85      0.62      0.72       325
           3       0.40      0.55      0.47       242
           4       0.49      0.55      0.52       166

    accuracy                           0.55       985
   macro avg       0.54      0.54      0.53       985
weighted avg       0.59      0.55      0.56       985



### XGBClassifier

*  without STOPWORD
*  without NUMBER character
*  resampled with ros method
*  tfidf

In [None]:
import xgboost as xgb
# Initialize the model
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
# Train the model
xgb_clf.fit(np.asarray(X_resampled_ros_TD_without_S_and_N), y_resampled_ros_TD_without_S_and_N)

# Predict the test set
y_pred_xgb = xgb_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_xgb= accuracy_score(y_test, y_pred_xgb)
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
recall_xgb = recall_score(y_test,y_pred_xgb, average='weighted')
f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
class_report = classification_report(y_test, y_pred_xgb)

print(f'svm Accuracy: {accuracy_xgb}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_xgb}')
print(f'Recall: {recall_xgb}')
print(f'F1 Score: {f1_xgb}')
print(f'Classification Report:\n{class_report}')

svm Accuracy: 0.5543147208121827
Confusion Matrix:
[[ 90   8  21  39  29]
 [  5  33   3  15   9]
 [ 23   9 206  71  16]
 [ 33  23  16 127  43]
 [ 23   7   5  41  90]]
Precision: 0.5838141642037349
Recall: 0.5543147208121827
F1 Score: 0.5632810479182196
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.48      0.50       187
           1       0.41      0.51      0.46        65
           2       0.82      0.63      0.72       325
           3       0.43      0.52      0.47       242
           4       0.48      0.54      0.51       166

    accuracy                           0.55       985
   macro avg       0.53      0.54      0.53       985
weighted avg       0.58      0.55      0.56       985



*  without STOPWORD
*  without NUMBER character
*  resampled with smoth method
*  tfidf

In [None]:
# Initialize the model
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
# Train the model
xgb_clf.fit(np.asarray(X_resampled_TD_without_S_and_N), y_resampled_TD_without_S_and_N)

# Predict the test set
y_pred_xgb = xgb_clf.predict(np.asarray(X_test_tfidf_without_S_and_N))

# Evaluate the model
accuracy_xgb= accuracy_score(y_test, y_pred_xgb)
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
recall_xgb = recall_score(y_test,y_pred_xgb, average='weighted')
f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
class_report = classification_report(y_test, y_pred_xgb)

print(f'svm Accuracy: {accuracy_xgb}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_xgb}')
print(f'Recall: {recall_xgb}')
print(f'F1 Score: {f1_xgb}')
print(f'Classification Report:\n{class_report}')

svm Accuracy: 0.5573604060913706
Confusion Matrix:
[[ 84   4  22  52  25]
 [  6  35   3  15   6]
 [ 24   6 213  65  17]
 [ 30  15  21 129  47]
 [ 27   5   7  39  88]]
Precision: 0.5796844128295273
Recall: 0.5573604060913706
F1 Score: 0.5643930846085555
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.45      0.47       187
           1       0.54      0.54      0.54        65
           2       0.80      0.66      0.72       325
           3       0.43      0.53      0.48       242
           4       0.48      0.53      0.50       166

    accuracy                           0.56       985
   macro avg       0.55      0.54      0.54       985
weighted avg       0.58      0.56      0.56       985



## Applying the strongest model to the test dataset

In [None]:
file_path_test = '/content/3rdHW_test.csv'
test = pd.read_csv(file_path_test, header=None)
test.columns = ['Text']
print(test.head(5))

                                                Text
0               صعب روزی، بوالعجب کاری، پریشان عالمی
1         بسیار نرم و لطیف بوده و کیفیت بالایی داره.
2      اصلا رنگش با چیزی که تو عکس بود خیلی فرق داشت
3            خیلی زیبا و ب اندازه و با دقت طراحی شده
4  سبزی پلو با ماهی مال عید نوروزه، امشب سوشی میخ...


In [None]:
test[['Cleaned_With_Numbers', 'Cleaned_Without_Numbers']] = test['Text'].progress_apply(preprocess).apply(pd.Series)
test_norm = transformer.transform(test['Cleaned_Without_Numbers'])
test_norm_Tf = vectorizer2.transform(test_norm).todense()
pd.DataFrame(test_norm_Tf, columns=vectorizer2.get_feature_names_out())


label_dict = {
    0: 'ANGRY',
    1: 'FEAR',
    2: 'HAPPY',
    3: 'OTHER',
    4: 'SAD'
}

y_pred_rf = rf_clf.predict(np.asarray(test_norm_Tf))

df = pd.DataFrame({
    'X': test_norm,
    'predicted': y_pred_rf})
df['y'] = df['predicted'].map(label_dict);df

  0%|          | 0/548 [00:00<?, ?it/s]

Unnamed: 0,X,predicted,y
0,صعب روزی بوالعجب کاری پریشان عالمی,3,OTHER
1,نرم لطیف کیفیت بالایی داره,2,HAPPY
2,اصلا رنگش عکس فرق,0,ANGRY
3,زیبا ب اندازه دقت طراحی‌شده,2,HAPPY
4,سبزی‌پلو ماهی مال عید نوروزه امشب سوشی می‌خوری...,1,FEAR
...,...,...,...
543,سرخط خبرهای عصر پنجشنبه,3,OTHER
544,بوی ماندگاری خوشم امدش مرسی دیجی,2,HAPPY
545,گاز نداریم اینترنت وطن,4,SAD
546,چندتاشو برا مغازه گرفتم باطریاشون کلا خرابن,2,HAPPY


##Application of the most powerful model with pipeline

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

tqdm.pandas()

# Pipeline definition
pipeline = Pipeline([
    ('normalize_and_remove_stopwords', transformer),
    ('vectorizer', TfidfVectorizer(min_df=2, max_features=1000, ngram_range=(1, 4))),
    ('oversampling', RandomOverSampler(random_state=42)),
    ('classifier', SVC(kernel='linear', random_state=42))
])

# Train the pipeline
pipeline.fit(X_train_without_num, y_train)

# Predict the test set
y_pred_svm = pipeline.predict(X_test_without_num)

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
conf_matrix = confusion_matrix(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test, y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
class_report = classification_report(y_test, y_pred_svm)

print(f'SVM Accuracy: {accuracy_svm}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Precision: {precision_svm}')
print(f'Recall: {recall_svm}')
print(f'F1 Score: {f1_svm}')
print(f'Classification Report:\n{class_report}')


SVM Accuracy: 0.566497461928934
Confusion Matrix:
[[100   8  20  35  24]
 [  7  37   2  14   5]
 [ 32  10 217  44  22]
 [ 38  28  13 113  50]
 [ 26   8   8  33  91]]
Precision: 0.5917688237611483
Recall: 0.566497461928934
F1 Score: 0.5745560644661686
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.53      0.51       187
           1       0.41      0.57      0.47        65
           2       0.83      0.67      0.74       325
           3       0.47      0.47      0.47       242
           4       0.47      0.55      0.51       166

    accuracy                           0.57       985
   macro avg       0.54      0.56      0.54       985
weighted avg       0.59      0.57      0.57       985

