Importing Libraries


In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
import demoji
import csv
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix

Downloading NLTK Data


In [None]:
# Fetch the NLTK data used below: the stop-word lists and the Punkt tokenizer
# models behind word_tokenize. NLTK 3.9 and later load the tokenizer from the
# 'punkt_tab' package instead of the pickled 'punkt' one, so request both to keep
# the notebook runnable on old and new NLTK releases.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

Loading Dataset

In [None]:
# Read the training split; later cells use its 'comment' and 'label' columns.
df = pd.read_csv(filepath_or_buffer='./train.csv')
# Show the first five rows as a quick sanity check of the load.
print(df.head(n=5))

Preprocessing Functions

In [None]:
# Arabic stop words held as a set: removeStopWords() performs one membership test
# per token, and a set answers that in constant time instead of scanning a list.
stop_words = set(stopwords.words('arabic'))
# Translation table that deletes every ASCII punctuation character
# (string.punctuation); Arabic punctuation marks are not part of it.
translator = str.maketrans('', '', string.punctuation)

def removeStopWords(text):
    """Return ``text`` with stop words removed.

    The text is split with NLTK's ``word_tokenize``; every token found in the
    module-level ``stop_words`` collection is dropped and the surviving tokens
    are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def NormalizeArabic(text):
    """Map common Arabic letter variants onto a single canonical letter.

    The substitutions run in the order listed: the alef variants become a bare
    alef, alef maqsura becomes yeh, waw-with-hamza and yeh-with-hamza become a
    standalone hamza, and teh marbuta becomes heh.

    Returns the normalized string; characters outside these rules are untouched.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def arabic_diacritics(text):
    """Return ``text`` with Arabic diacritic marks and the tatweel character removed."""
    # re.VERBOSE makes the regex engine ignore whitespace inside the pattern, so
    # only the marks listed between the '|' separators are matched.
    marks_pattern = re.compile(""" ّ|َ|ً|ُ|ٌ|ِ|ٍ|ْ|ـ""", re.VERBOSE)
    return marks_pattern.sub('', text)

def removeNumbers(text):
    """Return ``text`` without any character for which ``str.isdigit()`` is true."""
    non_digits = []
    for ch in text:
        if not ch.isdigit():
            non_digits.append(ch)
    return ''.join(non_digits)

def stemming(text):
    """Stem each token of ``text`` with NLTK's ISRI stemmer.

    The text is tokenized with ``word_tokenize`` and the stemmed tokens are
    returned joined by single spaces.
    """
    stemmer = ISRIStemmer()
    return " ".join(map(stemmer.stem, word_tokenize(text)))

def remove_english_characters(text):
    """Delete every run of ASCII letters (a-z, A-Z) from ``text``.

    Spaces, digits and non-Latin characters are left exactly where they were.
    """
    latin_run = re.compile(r'[a-zA-Z]+')
    return latin_run.sub('', text)

def process(text):
    """Turn sentence punctuation into spaces and collapse whitespace.

    Each of the Arabic question mark, the Arabic comma, '.', '!', '(' and ')'
    is replaced by a space; the result is then split on whitespace and rejoined
    with single spaces, which also strips leading and trailing whitespace.
    """
    # Substituting a space for each delimiter yields the same string as splitting
    # on the delimiter and joining the pieces with spaces.
    spaced = re.sub(r'[؟،.!()]', " ", text)
    words = spaced.split()
    return " ".join(words)

# Preprocessing Data
def _clean_text(text):
    """Run one raw cell value through the full cleaning pipeline and return the result."""
    if not isinstance(text, str):
        # Missing cells arrive as NaN/None and would make word_tokenize raise;
        # treat them as empty text and stringify any other non-string value.
        text = '' if pd.isna(text) else str(text)
    text = removeStopWords(text)
    text = NormalizeArabic(text)
    text = arabic_diacritics(text)
    text = removeNumbers(text)
    # NOTE(review): this deletes ASCII punctuation, including '.', '!', '(' and ')',
    # before process() gets a chance to turn those characters into spaces.
    text = text.translate(translator)
    text = stemming(text)
    text = process(text)
    return demoji.replace(text, "")


def preprocess_data(df, column_name):
    """Clean every value of ``df[column_name]`` in place.

    Each value goes through, in order: stop-word removal, letter normalization,
    diacritic removal, digit removal, ASCII punctuation removal, ISRI stemming,
    punctuation/whitespace cleanup and emoji removal.

    Parameters:
        df: DataFrame holding the text column; it is modified, not copied.
        column_name: label of the column to clean.

    Returns:
        None. The cleaned text replaces the original column on ``df``.
    """
    # A column-wise map replaces the row-by-row iterrows()/df.at loop: it avoids
    # building a Series per row and does not depend on index labels being unique.
    df[column_name] = df[column_name].map(_clean_text)

# Clean the training comments; preprocess_data rewrites the column on df itself.
preprocess_data(df, column_name='comment')

Feature Extraction


In [None]:
# TF-IDF over unigrams and bigrams, with sublinear (1 + log tf) term-frequency scaling.
tfidf = TfidfVectorizer(sublinear_tf=True, encoding='utf-8', ngram_range=(1, 2))
# Keep the document-term matrix sparse. Calling .toarray() would allocate a dense
# n_documents x vocabulary_size array, and RandomForestClassifier accepts sparse
# input for both fit and predict.
features = tfidf.fit_transform(df['comment'])
labels = df['label']

Model Training and Hyperparameter Tuning


In [None]:
# Candidate random-forest settings; GridSearchCV tries every combination.
# NOTE(review): 'random_state' is searched like a hyperparameter, which triples the
# number of fits and selects a seed rather than a model setting. Consider fixing one seed.
param_grid_rf = {'n_estimators': [800, 1000], 'max_features': [1, 0.5, 0.2], 'random_state': [3, 4, 5]}
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5)
grid_search_rf.fit(features, labels)

# best_score_ is the mean 5-fold cross-validated score of the winning candidate.
# score(features, labels) evaluates the refit model on the very data it was trained
# on, so it is reported separately and labelled as a training score.
print("Best CV Score: ", round(grid_search_rf.best_score_ * 100, 4), "%")
print("Training Score: ", round(grid_search_rf.score(features, labels) * 100, 4), "%")
print("Best Parameters: ", grid_search_rf.best_params_)
print("Best Estimator: ", grid_search_rf.best_estimator_)


Testing

In [None]:
# Load the unlabeled split and keep only the text column for cleaning.
df_test = pd.read_csv('./test.csv')
df_unseen = pd.DataFrame(data=df_test['comment'])

# Apply the same cleaning as for training, then reuse the already fitted vectorizer
# (transform, not fit_transform) so the columns match the training features.
preprocess_data(df_unseen, 'comment')
# The matrix stays sparse: the fitted forest predicts on sparse input directly, so
# there is no need to allocate a dense n_documents x vocabulary_size array.
test_features = tfidf.transform(df_unseen['comment'])
y_pred = grid_search_rf.predict(test_features)

# Saving Predictions to CSV
csv_columns = ['id', 'label']
csv_file = "prediction.csv"

try:
    # newline='' lets the csv module control line endings; the explicit encoding
    # keeps the output independent of the platform's default.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        # Ids are the 1-based position of each prediction.
        # NOTE(review): assumes the expected ids are 1..n in file order; confirm
        # against test.csv if it carries its own id column.
        writer.writerows(
            {'id': row_id, 'label': label}
            for row_id, label in enumerate(y_pred, start=1)
        )
except IOError as exc:
    # Report why the write failed instead of a bare message.
    print("I/O error:", exc)