In [1]:
import pandas as pd

In [2]:
cleaned_stemmed = pd.read_csv("995,000_cleaned_stemmed2.csv", dtype=str)

# remove the rows with missing values in the 'type' column and in the 'content' column
cleaned_stemmed = cleaned_stemmed.dropna(subset=['type', 'content'])

# replace 'political' and 'clickbait' with 'reliable'
cleaned_stemmed['type'] = cleaned_stemmed['type'].replace(['political', 'clickbait'], 'reliable')

# replace 'bias' and 'satire' with 'fake'
cleaned_stemmed['type'] = cleaned_stemmed['type'].replace(['bias', 'satire'], 'fake')

#  remove all the other types of news except 'reliable' and 'fake'
cleaned_stemmed = cleaned_stemmed[cleaned_stemmed['type'].isin(['reliable', 'fake'])]


print('Total rows after removing missing values:', cleaned_stemmed.shape[0])

Total rows after removing missing values: 691768


In [5]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [11]:
#CV = CountVectorizer()
#X = CV.fit_transform(cleaned_stemmed['content'])
tfidvec = TfidfVectorizer(stop_words='english',max_df=0.7) #max_df=0.7 means that we ignore terms that appear in more than 70% of the documents, and only focus on the less frequent ones, which could be more informative
X = tfidvec.fit_transform(cleaned_stemmed['content'])

X_train, X_test_1, y_train, y_test_1  = train_test_split(X, cleaned_stemmed['type'], test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test_1, y_test_1, test_size=0.5, random_state=42)

In [12]:
# Normalizing vectors
from sklearn.preprocessing import StandardScaler

scale = StandardScaler(with_mean=False)

X_train = scale.fit_transform(X_train)
X_val = scale.transform(X_val)


In [13]:
from sklearn import svm
import numpy as np

In [19]:
from sklearn import neural_network

#lsvc = svm.LinearSVC(max_iter=3500, random_state=42)
mlpc = neural_network.MLPClassifier(max_iter=3500, random_state=42, hidden_layer_sizes=(128, 64))
print("Model initialized")

#lsvc.fit(X_train, y_train)
mlpc.fit(X_train, y_train)
print("Model fitted")

y_pred = mlpc.predict(X_test)

Model initialized
Model fitted


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_val, y_pred)
print("accuracy: ", accuracy)

accuracy:  0.48783555227893666


In [22]:
# Finding optimal parameters
from sklearn.model_selection import GridSearchCV

mlpc2 = neural_network.MLPClassifier()

parameter_space = {
    'hidden_layer_sizes': [(128, 64), (100, 50)],
    'max_iter': [100, 500, 3500],
    'activation': ['tanh', 'relu'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

clf = GridSearchCV(mlpc2, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)

print('Best parameters found:\n', clf.best_params_)



KeyboardInterrupt: 