In [1]:
!pip install gdown

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


# Importing Libraries

In [2]:
import gdown
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier

from xgboost import XGBClassifier

# Importing Data

In [3]:
# Fetch the dataset CSV from Google Drive into the Kaggle working directory,
# then echo the destination path.
gdown.download(
    url="https://drive.google.com/uc?id=12OqvVFIDZcs1MsG8wJMbu-3zGKEKCLNy",
    output='/kaggle/working/data.csv',
    quiet=False,
)
print('/kaggle/working/data.csv')

Downloading...
From: https://drive.google.com/uc?id=12OqvVFIDZcs1MsG8wJMbu-3zGKEKCLNy
To: /kaggle/working/data.csv
100%|██████████| 736k/736k [00:00<00:00, 98.5MB/s]

/kaggle/working/data.csv





# Data Preprocessing

In [4]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv("/kaggle/working/data.csv")

In [5]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,text,comp,en1,en2
0,لابتوب اسوس افضل من لابتوبات ابل ؟ احس مستحيل ...,1,اسوس,ابل
1,اسوس افضل من وجهة نظري,1,اسوس,
2,كلهم نفس المواصفات مافي فرق اسوس افضل من hp في...,1,اسوس,hp
3,التخصصات الهندسية بالذات الحاسب والبرمجيات ماي...,1,اسوس,
4,اسوس افضل من ناحية التنوع والاسعار والعروض,1,اسوس,


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(4599, 4)

In [7]:
# Model input is the `text` column; the target is the `comp` column.
X, y = df['text'], df['comp']

In [8]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class ratio. The seed fixes the split across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# TF-IDF features over the text; max_features=5000 caps the vocabulary size.
vectorizer = TfidfVectorizer(max_features=5000)

In [10]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
# NOTE(review): X_train / X_test are rebound from text to feature matrices
# here, so this cell is not idempotent — re-run the train/test split cell
# before re-running this one.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [11]:
# Sanity check: both matrices should have the same number of feature columns.
print(X_train.shape, X_test.shape, sep='\n')

(3679, 5000)
(920, 5000)


# Modeling

In [12]:
# Logistic regression on the TF-IDF features.
lrc = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred = lrc.predict(X_test)

# Report each metric on the held-out split, in a fixed order.
for metric_name, metric_fn in [
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1 Score: ', f1_score),
]:
    print(metric_name, metric_fn(y_test, y_pred))

Accuracy:  0.9891304347826086
Precision:  0.9871794871794872
Recall:  0.9914163090128756
F1 Score:  0.9892933618843683


In [13]:
# Support vector classifier on the TF-IDF features.
svc = SVC(random_state=42).fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Report each metric on the held-out split, in a fixed order.
for metric_name, metric_fn in [
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1 Score: ', f1_score),
]:
    print(metric_name, metric_fn(y_test, y_pred))

Accuracy:  0.9923913043478261
Precision:  0.9893390191897654
Recall:  0.9957081545064378
F1 Score:  0.9925133689839571


In [14]:
# Random forest on the TF-IDF features.
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Report each metric on the held-out split, in a fixed order.
for metric_name, metric_fn in [
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1 Score: ', f1_score),
]:
    print(metric_name, metric_fn(y_test, y_pred))

Accuracy:  0.9978260869565218
Precision:  0.9978540772532188
Recall:  0.9978540772532188
F1 Score:  0.9978540772532188


In [15]:
# AdaBoost on the TF-IDF features.
abc = AdaBoostClassifier(random_state=42).fit(X_train, y_train)
y_pred = abc.predict(X_test)

# Report each metric on the held-out split, in a fixed order.
for metric_name, metric_fn in [
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1 Score: ', f1_score),
]:
    print(metric_name, metric_fn(y_test, y_pred))

Accuracy:  0.9945652173913043
Precision:  0.9914712153518124
Recall:  0.9978540772532188
F1 Score:  0.9946524064171122


In [16]:
# Bagging ensemble on the TF-IDF features.
bgc = BaggingClassifier(random_state=42).fit(X_train, y_train)
y_pred = bgc.predict(X_test)

# Report each metric on the held-out split, in a fixed order.
for metric_name, metric_fn in [
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1 Score: ', f1_score),
]:
    print(metric_name, metric_fn(y_test, y_pred))

Accuracy:  0.9967391304347826
Precision:  0.9957173447537473
Recall:  0.9978540772532188
F1 Score:  0.9967845659163987


In [17]:
# Gradient boosting on the TF-IDF features; `gbt` is the model that gets
# pickled further down in this notebook.
gbt = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred = gbt.predict(X_test)

# Report each metric on the held-out split, in a fixed order.
for metric_name, metric_fn in [
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1 Score: ', f1_score),
]:
    print(metric_name, metric_fn(y_test, y_pred))

Accuracy:  0.9978260869565218
Precision:  0.9957264957264957
Recall:  1.0
F1 Score:  0.9978586723768736


In [18]:
# XGBoost classifier on the TF-IDF features.
xgb = XGBClassifier(random_state=42).fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# Report each metric on the held-out split, in a fixed order.
for metric_name, metric_fn in [
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1 Score: ', f1_score),
]:
    print(metric_name, metric_fn(y_test, y_pred))

Accuracy:  0.9967391304347826
Precision:  0.9957173447537473
Recall:  0.9978540772532188
F1 Score:  0.9967845659163987


In [19]:
# Persist the fitted vectorizer and the gradient-boosting model, in that order,
# so they can be reloaded together for inference.
for artifact_path, artifact in (
    ('/kaggle/working/tfidf_vectorizer.pkl', vectorizer),
    ('/kaggle/working/gbt.pkl', gbt),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# Testing

In [20]:
# Reload the pickled TF-IDF vectorizer from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load files you trust
# (this one is written earlier in this notebook).
with open('/kaggle/working/tfidf_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

In [21]:
# Reload the pickled gradient-boosting model from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load files you trust
# (this one is written earlier in this notebook).
with open('/kaggle/working/gbt.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [22]:
def apply_model(sentence, loaded_vectorizer, loaded_model):
    """Classify a single sentence as comparative or non comparative.

    Parameters
    ----------
    sentence : str
        Text to classify. It is handed to the vectorizer unchanged.
    loaded_vectorizer
        Fitted object exposing ``transform(list_of_texts)``.
    loaded_model
        Fitted object exposing ``predict(features)`` that returns one label
        per input row.

    Returns
    -------
    str
        ``"comparative"`` when the predicted label equals 1, otherwise
        ``"non comparative"``.
    """
    # The vectorizer expects a collection of documents, so wrap the sentence.
    sentence_transformed = loaded_vectorizer.transform([sentence])
    # predict() returns one label per row; pick the single label explicitly
    # instead of comparing the whole container to 1 and relying on its
    # truthiness.
    prediction = loaded_model.predict(sentence_transformed)[0]
    return "comparative" if prediction == 1 else "non comparative"

In [23]:
# TODO: a preprocessing function is still needed — the sentence is currently
# passed to the model as raw text.
apply_model("سماعات ابل افضل من البيتس", loaded_vectorizer, loaded_model)

'comparative'