In [4]:
import pandas as pd
import warnings
from google.colab import drive
drive.mount('/content/drive')
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [None]:
data = pd.read_csv('/content/drive/MyDrive/Notebooks/WELFake_Dataset.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [None]:
data.shape

(72134, 4)

In [None]:
data_label_1 = data[data['label'] == 1].sample(n=10000, random_state=42)
data_label_0 = data[data['label'] == 0].sample(n=10000, random_state=42)

data = pd.concat([data_label_1, data_label_0], axis=0)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.shape

(20000, 4)

In [None]:
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,10000
1,10000


#merge title and text into one feature

In [None]:
df = data.copy()
df['text'] = df['title'] + df['text']
df.drop(['title','Unnamed: 0'],axis=1,inplace=True)
df.head()

Unnamed: 0,text,label
0,7 Reasons Why Obamacare 2.0 Is All But Guarant...,0
1,Angry Voters Trying to Flee Country Over Presi...,1
2,BLACK LEADER CALLS ON BLACK CAUCUS MEMBERS TO ...,1
3,Classless Obama Refuses Photo-Op of White Hous...,1
4,German parties in coalition talks agree on no ...,0


In [None]:
df.isna().sum()

Unnamed: 0,0
text,144
label,0


In [None]:
df.dropna(inplace=True)
df.shape

(19856, 2)

In [None]:
df.head(10)

Unnamed: 0,text,label
0,7 Reasons Why Obamacare 2.0 Is All But Guarant...,0
1,Angry Voters Trying to Flee Country Over Presi...,1
2,BLACK LEADER CALLS ON BLACK CAUCUS MEMBERS TO ...,1
3,Classless Obama Refuses Photo-Op of White Hous...,1
4,German parties in coalition talks agree on no ...,0
5,"Sky Poll Alleging Britain Is ’More Racist, Les...",0
6,Anti-Travel Ban Lawyer Leans on Argument that ...,0
7,Comment on A Group Of Reluctant Men Hold Kitte...,1
8,Homeland Security deal: Will Boehner follow Mc...,0
9,Un muerto y cuatro heridos dejan protestas vio...,1


In [None]:
import re
import spacy
nlp = spacy.load('en_core_web_sm')

def preprocess(text):
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
  text = re.sub(r'[^A-Za-z0-9\s]', '', text)
  text = re.sub(r'\s+', ' ', text).strip()

  text = text.lower()
  doc = nlp(text)

  res = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]


  return ' '.join(res)

In [None]:
df['processed_text'] = df['text'].apply(preprocess)

df.head()

Unnamed: 0,text,label,processed_text
0,7 Reasons Why Obamacare 2.0 Is All But Guarant...,0,7 reason obamacare 20 guarantee impose crush c...
1,Angry Voters Trying to Flee Country Over Presi...,1,angry voter try flee country presidential elec...
2,BLACK LEADER CALLS ON BLACK CAUCUS MEMBERS TO ...,1,black leader call black caucus member resign p...
3,Classless Obama Refuses Photo-Op of White Hous...,1,classless obama refuse photoop white house wel...
4,German parties in coalition talks agree on no ...,0,german party coalition talk agree new debtberl...


In [None]:
df.to_csv('/content/drive/MyDrive/Notebooks/processed_WELFake_Dataset.csv', index=False)

NameError: name 'df' is not defined

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Notebooks/processed_WELFake_Dataset.csv')
df.head()

Unnamed: 0,text,label,processed_text
0,7 Reasons Why Obamacare 2.0 Is All But Guarant...,0,7 reason obamacare 20 guarantee impose crush c...
1,Angry Voters Trying to Flee Country Over Presi...,1,angry voter try flee country presidential elec...
2,BLACK LEADER CALLS ON BLACK CAUCUS MEMBERS TO ...,1,black leader call black caucus member resign p...
3,Classless Obama Refuses Photo-Op of White Hous...,1,classless obama refuse photoop white house wel...
4,German parties in coalition talks agree on no ...,0,german party coalition talk agree new debtberl...


In [6]:
df['processed_text'].isna().sum()

2

In [7]:
df.dropna(inplace=True)
df.shape

(19854, 3)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

def model(x_train, x_test, y_train, y_test):
  LR = LogisticRegression()
  svc = SVC()
  KNN = KNeighborsClassifier()
  DT = DecisionTreeClassifier()
  GB = GradientBoostingClassifier()
  ADA = AdaBoostClassifier()
  XGB = XGBClassifier()
  RF = RandomForestClassifier(n_estimators=100, random_state=42)
  NB = MultinomialNB()


  LR.fit(x_train, y_train)
  svc.fit(x_train,y_train)
  KNN.fit(x_train,y_train)
  DT.fit(x_train,y_train)
  GB.fit(x_train,y_train)
  ADA.fit(x_train,y_train)
  XGB.fit(x_train,y_train)
  RF.fit(x_train, y_train)
  NB.fit(x_train, y_train)

  LR_y_pred = LR.predict(x_test)
  SVC_y_pred = svc.predict(x_test)
  KNN_y_pred = KNN.predict(x_test)
  DT_y_pred = DT.predict(x_test)
  GB_y_pred = GB.predict(x_test)
  ADA_y_pred = ADA.predict(x_test)
  XGB_y_pred = XGB.predict(x_test)
  RF_y_pred = RF.predict(x_test)
  NB_y_pred = NB.predict(x_test)

  print('Logistic Regression')
  print(accuracy_score(y_test, LR_y_pred))
  print(classification_report(y_test, LR_y_pred))
  print()

  print('SVC')
  print(accuracy_score(y_test, SVC_y_pred))
  print(classification_report(y_test, SVC_y_pred))
  print()

  print('KNN')
  print(accuracy_score(y_test, KNN_y_pred))
  print(classification_report(y_test, KNN_y_pred))
  print()

  print('Decision Tree')
  print(accuracy_score(y_test, DT_y_pred))
  print(classification_report(y_test, DT_y_pred))
  print()

  print('Gradient Boosting')
  print(accuracy_score(y_test, GB_y_pred))
  print(classification_report(y_test, GB_y_pred))
  print()

  print('AdaBoost')
  print(accuracy_score(y_test, ADA_y_pred))
  print(classification_report(y_test, ADA_y_pred))
  print()

  print('XGBoost')
  print(accuracy_score(y_test, XGB_y_pred))
  print(classification_report(y_test, XGB_y_pred))

  print('Random Forest')
  print(accuracy_score(y_test, RF_y_pred))
  print(classification_report(y_test, RF_y_pred))

  print('Naive Bayes')
  print(accuracy_score(y_test, NB_y_pred))
  print(classification_report(y_test, NB_y_pred))

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(max_features=10000)
x = vec.fit_transform(df['processed_text'])
y = df['label']

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42,stratify=y)

In [None]:
# model(x_train, x_test, y_train, y_test)

Logistic Regression
0.9244522790229162
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      2000
           1       0.92      0.93      0.92      1971

    accuracy                           0.92      3971
   macro avg       0.92      0.92      0.92      3971
weighted avg       0.92      0.92      0.92      3971


SVC
0.9385545202719718
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      2000
           1       0.93      0.95      0.94      1971

    accuracy                           0.94      3971
   macro avg       0.94      0.94      0.94      3971
weighted avg       0.94      0.94      0.94      3971


KNN
0.7990430622009569
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      2000
           1       0.80      0.80      0.80      1971

    accuracy                           0.80      3971
   macro avg       0.80      0.80      0.

In [11]:
from sklearn.ensemble import StackingClassifier

base_model = [
    ('LR',LogisticRegression()),
    ('SVC',SVC()),
    ('GB',GradientBoostingClassifier()),
    ('XGB',XGBClassifier())
]

meta_model = XGBClassifier()

sc = StackingClassifier(estimators=base_model, final_estimator=meta_model)
sc.fit(x_train, y_train)
y_pred = sc.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9579451019894233
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2000
           1       0.95      0.96      0.96      1971

    accuracy                           0.96      3971
   macro avg       0.96      0.96      0.96      3971
weighted avg       0.96      0.96      0.96      3971



###TfidfVectorizer with grams

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
x = vec.fit_transform(df['processed_text'])
y = df['label']

In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42,stratify=y)

In [15]:
model(x_train, x_test, y_train, y_test)

Logistic Regression
0.928733316544951
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      2000
           1       0.92      0.94      0.93      1971

    accuracy                           0.93      3971
   macro avg       0.93      0.93      0.93      3971
weighted avg       0.93      0.93      0.93      3971


SVC
0.9400654746915135
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      2000
           1       0.93      0.95      0.94      1971

    accuracy                           0.94      3971
   macro avg       0.94      0.94      0.94      3971
weighted avg       0.94      0.94      0.94      3971


KNN
0.778393351800554
              precision    recall  f1-score   support

           0       0.86      0.66      0.75      2000
           1       0.72      0.89      0.80      1971

    accuracy                           0.78      3971
   macro avg       0.79      0.78      0.78

In [14]:
from sklearn.ensemble import StackingClassifier

base_model = [
    ('LR',LogisticRegression()),
    ('SVC',SVC()),
    ('GB',GradientBoostingClassifier()),
    ('XGB',XGBClassifier())
]

meta_model = XGBClassifier()

sc = StackingClassifier(estimators=base_model, final_estimator=meta_model)
sc.fit(x_train, y_train)
y_pred = sc.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9556786703601108
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2000
           1       0.95      0.96      0.96      1971

    accuracy                           0.96      3971
   macro avg       0.96      0.96      0.96      3971
weighted avg       0.96      0.96      0.96      3971

