In [65]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
import pickle

In [52]:
df = pd.read_csv('archive/train.csv')
test = pd.read_csv('archive/test.csv')

In [3]:
df

Unnamed: 0,profile pic,nums/length username,fullname words,nums/length fullname,name==username,description length,external URL,private,#posts,#followers,#follows,fake
0,1,0.27,0,0.00,0,53,0,0,32,1000,955,0
1,1,0.00,2,0.00,0,44,0,0,286,2740,533,0
2,1,0.10,2,0.00,0,0,0,1,13,159,98,0
3,1,0.00,1,0.00,0,82,0,0,679,414,651,0
4,1,0.00,2,0.00,0,0,0,1,6,151,126,0
...,...,...,...,...,...,...,...,...,...,...,...,...
571,1,0.55,1,0.44,0,0,0,0,33,166,596,1
572,1,0.38,1,0.33,0,21,0,0,44,66,75,1
573,1,0.57,2,0.00,0,0,0,0,4,96,339,1
574,1,0.57,1,0.00,0,11,0,0,0,57,73,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   profile pic           576 non-null    int64  
 1   nums/length username  576 non-null    float64
 2   fullname words        576 non-null    int64  
 3   nums/length fullname  576 non-null    float64
 4   name==username        576 non-null    int64  
 5   description length    576 non-null    int64  
 6   external URL          576 non-null    int64  
 7   private               576 non-null    int64  
 8   #posts                576 non-null    int64  
 9   #followers            576 non-null    int64  
 10  #follows              576 non-null    int64  
 11  fake                  576 non-null    int64  
dtypes: float64(2), int64(10)
memory usage: 54.1 KB


In [47]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   profile pic           120 non-null    int64  
 1   nums/length username  120 non-null    float64
 2   fullname words        120 non-null    int64  
 3   nums/length fullname  120 non-null    float64
 4   name==username        120 non-null    int64  
 5   description length    120 non-null    int64  
 6   external URL          120 non-null    int64  
 7   private               120 non-null    int64  
 8   #posts                120 non-null    int64  
 9   #followers            120 non-null    int64  
 10  #follows              120 non-null    int64  
 11  fake                  120 non-null    int64  
dtypes: float64(2), int64(10)
memory usage: 11.4 KB


In [55]:
X_train = df.drop(['fake'],axis=1)
y_train = df['fake']
X_test = test.drop(['fake'],axis=1)
y_test = test['fake']

In [56]:
X_train = MinMaxScaler().fit_transform(X_train)

In [57]:
X_test = MinMaxScaler().fit_transform(X_test)

In [58]:
models = [
    {
        "name": "Logistic Regression",
        "model": LogisticRegression(),
        "params": {
            "C": [0.1,1.0,2.0,5.0],
            "max_iter": [500,1000]
        }
    },
    {
        "name": "Random Forest Classifier",
        "model": RandomForestClassifier(),
        "params": {
            "criterion": ["gini", "entropy"],
            "n_estimators": [50, 100, 200]
        }
    },
    {
        "name": "Ada Boost Classifier",
        "model": AdaBoostClassifier(),
        "params": {
            "n_estimators": [50,100,200]
        }
    }
]

In [63]:
for item in models:
    clf = GridSearchCV(item['model'],item['params'],cv=5)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(f"{item["name"]}, parameters={clf.best_params_}")
    print(classification_report(y_test,y_pred))
    print("")

Logistic Regression, parameters={'C': 2.0, 'max_iter': 500}
              precision    recall  f1-score   support

           0       0.86      0.92      0.89        60
           1       0.91      0.85      0.88        60

    accuracy                           0.88       120
   macro avg       0.89      0.88      0.88       120
weighted avg       0.89      0.88      0.88       120


Random Forest Classifier, parameters={'criterion': 'entropy', 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.82      0.98      0.89        60
           1       0.98      0.78      0.87        60

    accuracy                           0.88       120
   macro avg       0.90      0.88      0.88       120
weighted avg       0.90      0.88      0.88       120


Ada Boost Classifier, parameters={'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.84      0.98      0.91        60
           1       0.98      0.82    

In [64]:
adaBoost = AdaBoostClassifier(n_estimators=200)
adaBoost.fit(X_train,y_train)

In [68]:
filename = 'model.sav'
pickle.dump(adaBoost, open(filename,'wb'))

In [70]:
loaded_model = pickle.load(open(filename,'rb'))