<a href="https://colab.research.google.com/github/Ahmed-A-Salem/Fake_Reddit_Post/blob/main/Fake_Reddit_Post_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kaggle API

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c cisc-873-dm-f22-a3

Downloading cisc-873-dm-f22-a3.zip to /content
  0% 0.00/5.62M [00:00<?, ?B/s]
100% 5.62M/5.62M [00:00<00:00, 74.8MB/s]


In [None]:
!unzip '/content/cisc-873-dm-f22-a3.zip'

Archive:  /content/cisc-873-dm-f22-a3.zip
  inflating: sample_submission.csv   
  inflating: x_test.csv              
  inflating: xy_train.csv            


# Imports

In [None]:
import re
import nltk
import string
import sklearn
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Training

In [None]:
# Reading the training dataset
data = pd.read_csv('/content/xy_train.csv', sep=",", na_values=[""])
data

Unnamed: 0,id,text,label
0,265723,A group of friends began to volunteer at a hom...,0
1,284269,British Prime Minister @Theresa_May on Nerve A...,0
2,207715,"In 1961, Goodyear released a kit that allows P...",0
3,551106,"Happy Birthday, Bob Barker! The Price Is Right...",0
4,8584,"Obama to Nation: 聙""Innocent Cops and Unarmed Y...",0
...,...,...,...
59995,70046,Finish Sniper Simo H盲yh盲 during the invasion o...,0
59996,189377,Nigerian Prince Scam took $110K from Kansas ma...,1
59997,93486,Is It Safe To Smoke Marijuana During Pregnancy...,0
59998,140950,Julius Caesar upon realizing that everyone in ...,0


In [None]:
# Checking the output labels. It should be 0 or 1 so if there are any other label, they should be removed
data['label'].value_counts(normalize=True)

0    0.536200
1    0.459933
2    0.003867
Name: label, dtype: float64

In [None]:
# Found output label = 2, so it should be removed
data = data[data['label']!=2]

In [None]:
# Making sure the labels are removed, noticed that the dataset is balanced
data['label'].value_counts(normalize=True)

0    0.538281
1    0.461719
Name: label, dtype: float64

In [None]:
# Setting up the preprocessing function for the text
nltk.download('punkt')
def clean_text(text):

    # Removing white spaces
    RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)  

    # Removing html tags
    RE_TAGS = re.compile(r"<[^>]+>")

    # Keeping only ASCII characters and punctuation
    RE_ASCII = re.compile(r"[^A-Za-z,.!? ]", re.IGNORECASE)
    RE_SINGLECHAR = re.compile(r"\b[A-Za-z,.!?]\b", re.IGNORECASE)

    text = re.sub(RE_TAGS, " ", text)
    text = re.sub(RE_ASCII, " ", text)
    text = re.sub(RE_SINGLECHAR, " ", text)
    text = re.sub(RE_WSPACE, " ", text)

    # Tokenizing the text and changing all of them to lower case
    word_tokens = word_tokenize(text)
    words_tokens_lower = [word.lower() for word in word_tokens]
  
    text_clean = " ".join(words_tokens_lower)
    return text_clean

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Preprocessing the text using the fuction defined above
data["text_clean"] = data["text"].map( lambda x: clean_text(x) if isinstance(x, str) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Desplaying the dataset before and after the function
data

Unnamed: 0,id,text,label,text_clean
0,265723,A group of friends began to volunteer at a hom...,0,group of friends began to volunteer at homeles...
1,284269,British Prime Minister @Theresa_May on Nerve A...,0,british prime minister theresa may on nerve at...
2,207715,"In 1961, Goodyear released a kit that allows P...",0,"in , goodyear released kit that allows ps to b..."
3,551106,"Happy Birthday, Bob Barker! The Price Is Right...",0,"happy birthday , bob barker ! the price is rig..."
4,8584,"Obama to Nation: 聙""Innocent Cops and Unarmed Y...",0,obama to nation innocent cops and unarmed youn...
...,...,...,...,...
59995,70046,Finish Sniper Simo H盲yh盲 during the invasion o...,0,finish sniper simo yh during the invasion of f...
59996,189377,Nigerian Prince Scam took $110K from Kansas ma...,1,nigerian prince scam took from kansas man year...
59997,93486,Is It Safe To Smoke Marijuana During Pregnancy...,0,is it safe to smoke marijuana during pregnancy...
59998,140950,Julius Caesar upon realizing that everyone in ...,0,julius caesar upon realizing that everyone in ...


In [None]:
# Word Frequency of most common words
word_freq = pd.Series(" ".join(data["text_clean"]).split()).value_counts()
word_freq[1:40]

,        41689
.        40001
to       30434
of       29138
in       23552
and      19733
for      12312
it       11347
on       11230
this     10307
is        9990
with      8492
?         7716
from      7487
my        7291
that      7017
you       6486
his       6391
at        6349
by        5455
was       5443
he        5207
after     5124
!         4906
an        4530
has       4194
as        4163
they      3938
are       3786
be        3460
out       3404
have      3401
one       3146
her       3112
new       2997
but       2962
who       2900
like      2899
up        2846
dtype: int64

In [None]:
# Desplaying the least frequency words
word_freq[-10:].reset_index(name="freq")

Unnamed: 0,index,freq
0,unclassified,1
1,sahrawis,1
2,ppbhtzvzvk,1
3,aerbghkwhgtoxisihluc,1
4,melanin,1
5,blessence,1
6,vlc,1
7,latore,1
8,heythatsluke,1
9,wahre,1


In [None]:
# Splitting the dataset to train and test sets
train, test = train_test_split(data, random_state=1, test_size=0.25, shuffle=True)

X_train = train["text_clean"]
Y_train = train["label"]
X_test = test["text_clean"]
Y_test = test["label"]

print(X_train.shape)
print(X_test.shape)

(44826,)
(14942,)


In [None]:
# Defining the Vectorizer, setting it parameters and fitting it with the dataset
vectorizer = TfidfVectorizer( analyzer="word", max_df=0.3, min_df=10, ngram_range=(1, 2), norm="l2" )
vectorizer.fit(data["text_clean"])

TfidfVectorizer(max_df=0.3, min_df=10, ngram_range=(1, 2))

In [None]:
# Vector representation of vocabulary
word_vector = pd.Series(vectorizer.vocabulary_).sample(5, random_state=1)
print(f"Unique word (ngram) vector extract:\n\n{word_vector}")

Unique word (ngram) vector extract:

wipe              20467
of medical        11879
my grandfather    11148
can someone        2715
crop               3940
dtype: int64


In [None]:
# Splitting the data for training process
train, test = train_test_split(data, random_state=1, test_size=0.25)

X_train = train["text_clean"]
Y_train = train["label"]
X_test = test["text_clean"]
Y_test = test["label"]

print(X_train.shape)
print(X_test.shape)

(44826,)
(14942,)


In [None]:
# Applying the vectorizer on the input data
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

split_index = [-1 if x in X_train.index else 0 for x in data["text_clean"].index]
pds = PredefinedSplit(test_fold = split_index)

In [None]:
# Trying different models to test which one can perform better on this dataset
classifiers = [
    LogisticRegression(solver="sag", random_state=1),
    KNeighborsClassifier(n_neighbors=3),
    RandomForestClassifier(random_state=1),
    XGBClassifier(random_state=1),
    MLPClassifier(
        random_state=1,
        solver="adam",
        hidden_layer_sizes=(12, 12, 12),
        activation="relu",
        early_stopping=True,
        n_iter_no_change=1,
    ),
]
# get names of the objects in list (too lazy for c&p...)
names = [re.match(r"[^\(]+", name.__str__())[0] for name in classifiers]
print(f"Classifiers to test: {names}")

Classifiers to test: ['LogisticRegression', 'KNeighborsClassifier', 'RandomForestClassifier', 'XGBClassifier', 'MLPClassifier']


In [None]:
# training the classifiers and sacing the result in a list to print it later
results = {}
for name, clf in zip(names, classifiers):
    print(f"Training classifier: {name}")
    clf.fit(X_train_vec, Y_train)
    prediction = clf.predict(X_test_vec)
    report = sklearn.metrics.classification_report(Y_test, prediction)
    results[name] = report

Training classifier: LogisticRegression
Training classifier: KNeighborsClassifier
Training classifier: RandomForestClassifier
Training classifier: XGBClassifier
Training classifier: MLPClassifier


In [None]:
# Based on the results, logistic regression seems to be the best performing classifier among them, so it will be used and a trial will be done on XGBclassifier too.
for k, v in results.items():
    print(f"Results for {k}:")
    print(f"{v}\n")

Results for LogisticRegression:
              precision    recall  f1-score   support

           0       0.83      0.82      0.82      8035
           1       0.80      0.80      0.80      6907

    accuracy                           0.81     14942
   macro avg       0.81      0.81      0.81     14942
weighted avg       0.81      0.81      0.81     14942


Results for KNeighborsClassifier:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70      8035
           1       0.88      0.00      0.01      6907

    accuracy                           0.54     14942
   macro avg       0.71      0.50      0.35     14942
weighted avg       0.70      0.54      0.38     14942


Results for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.79      0.80      0.80      8035
           1       0.76      0.76      0.76      6907

    accuracy                           0.78     14942
   macro avg       0.78 

## XGBClassifier

In [None]:
# feature creation and modelling in a single function
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("XGB", XGBClassifier(random_state=1))])

# define parameter space to test # runtime 35min
params = {
    "tfidf__analyzer" : ['word','char'],
    "tfidf__ngram_range": [(1, 2), (1, 3)],
    "tfidf__max_df": np.arange(0.3, 0.8),
    "tfidf__min_df": np.arange(5, 100),
}
# it is quite slow so we do 4 for now
pipe_clf = RandomizedSearchCV(
    pipe, params, cv = pds, n_jobs=-1, scoring="roc_auc", n_iter=3)
# pipe_clf.fit(X_train, Y_train)
pipe_clf.fit(data['text_clean'], data['label'])
print('best score {}'.format(pipe_clf.best_score_))
print('best params {}'.format(pipe_clf.best_params_))

best score 0.8079163188253051
best params {'tfidf__ngram_range': (1, 3), 'tfidf__min_df': 50, 'tfidf__max_df': 0.3, 'tfidf__analyzer': 'char'}


In [None]:
# feature creation and modelling in a single function
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("XGB", XGBClassifier(random_state=1))])

# define parameter space to test # runtime 19min
params = {
    "tfidf__analyzer" : ['char'],
    "tfidf__ngram_range": [(1, 3)],
    "tfidf__max_df": [0.3],
    "tfidf__min_df": [50],
    "XGB__n_neighbors": [1,3,5],
    "XGB__leaf_size": [20,30]
}
pipe_XGB_clf = RandomizedSearchCV(pipe, params, cv=pds, n_jobs=-1, scoring="roc_auc",  n_iter=3)
pipe_XGB_clf.fit(data['text_clean'], data['label'])
print('best score {}'.format(pipe_XGB_clf.best_score_))
print('best params {}'.format(pipe_XGB_clf.best_params_))

best score 0.8079163188253051
best params {'tfidf__ngram_range': (1, 3), 'tfidf__min_df': 50, 'tfidf__max_df': 0.3, 'tfidf__analyzer': 'char', 'XGB__n_neighbors': 5, 'XGB__leaf_size': 20}


## Logistic Regression

In [None]:
# feature creation and modelling in a single function
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("LGR",LogisticRegression(random_state=1))])

# define parameter space to test # runtime 35min
params = {
    "tfidf__analyzer" : ['word','char'],
    "tfidf__ngram_range": [(1, 2), (1, 3)],
    "tfidf__max_df": np.arange(0.3, 0.8),
    "tfidf__min_df": np.arange(5, 100),
}
# it is quite slow so we do 4 for now
pipe_LGR_clf = RandomizedSearchCV(
    pipe, params, cv=pds, n_jobs=-1, scoring="roc_auc", n_iter=3)
pipe_LGR_clf.fit(data['text_clean'], data['label'])
print('best score {}'.format(pipe_LGR_clf.best_score_))
print('best params {}'.format(pipe_LGR_clf.best_params_))

best score 0.879967594097655
best params {'tfidf__ngram_range': (1, 3), 'tfidf__min_df': 33, 'tfidf__max_df': 0.3, 'tfidf__analyzer': 'word'}


In [None]:
# feature creation and modelling in a single function
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("LGR", LogisticRegression(random_state=1))])

# define parameter space to test # runtime 19min
params = {
    "tfidf__analyzer" : ['word'],
    "tfidf__ngram_range": [(1, 3)],
    "tfidf__max_df": [0.3],
    "tfidf__min_df": [33],
    "LGR__penalty" : ['l2'],
    "LGR__solver"  : ['newton-cg', 'lbfgs', 'liblinear'],
    "LGR__C"       : [0.0001, 0.001, 0.1, 1, 10, 100]
}
pipe_LGR_clfer = RandomizedSearchCV(pipe, params, cv=pds, n_jobs=-1, scoring="roc_auc",  n_iter=3)
pipe_LGR_clfer.fit(data['text_clean'], data['label'])
print('best score {}'.format(pipe_LGR_clfer.best_score_))
print('best params {}'.format(pipe_LGR_clfer.best_params_))

best score 0.874839320764242
best params {'tfidf__ngram_range': (1, 3), 'tfidf__min_df': 33, 'tfidf__max_df': 0.3, 'tfidf__analyzer': 'word', 'LGR__solver': 'liblinear', 'LGR__penalty': 'l2', 'LGR__C': 10}


***Colclusion: -***
* Logistic Regression is the best performing model among the ones tested it got accuracy of 87% on validation dataset and 85% on the test dataset (Kaggle's accuracy result)

# Testing

In [None]:
# Reading test data
test_data = pd.read_csv('/content/x_test.csv', sep=",", na_values=[""])

In [None]:
# Applying the text preprocessin on the test data
test_data["text"] = test_data["text"].map(
    lambda x: clean_text(x) if isinstance(x, str) else x
)

In [None]:
# # Kaggle submission
# submission = pd.DataFrame()

# submission['id'] = test_data['id']

# submission['label'] = pipe_LGR_clfer.predict_proba(test_data['text'])[:,1]

# submission.to_csv('4thclf_sample_submission_walkthrough.csv', index=False)