In [25]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import nltk

In [26]:
# Read the two source CSVs into `fake` and `true` (paths are relative to the notebook)
fake, true = (
    pd.read_csv(csv_path)
    for csv_path in ("data/Fake.csv", "data/True.csv")
)

In [27]:
# Tag every row with its origin so the label survives the merge of both frames
fake = fake.assign(target='fake')
true = true.assign(target='true')

In [28]:
# Concatenate dataframes (ignore_index gives the combined frame a fresh 0..n-1 index)
data = pd.concat([fake, true], ignore_index=True)

# Shuffle the data.
# A fixed random_state makes the row order identical on every run, so a
# Restart & Run All reproduces the same frame instead of a new permutation.
from sklearn.utils import shuffle
data = shuffle(data, random_state=42).reset_index(drop=True)

In [29]:
# Remove the date and the article body: only the title is used as a feature.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable; a second run no longer raises KeyError for the columns
# that are already gone.
data = data.drop(columns=["date", "text"], errors="ignore")

# Convert to lowercase (vectorised string method instead of a per-row lambda)
data['title'] = data['title'].str.lower()

In [30]:
# Remove punctuation
import string

def punctuation_removal(text):
    """Return ``text`` with its punctuation stripped.

    Walks over ``text`` item by item, keeps every item that is not contained
    in ``string.punctuation`` and concatenates the kept items into one string.
    For a plain ``str`` the items are its characters.
    """
    kept = filter(lambda item: item not in string.punctuation, text)
    return ''.join(kept)

# Strip punctuation from every title, one string at a time
data['title'] = data['title'].map(punctuation_removal)

In [31]:
# Removing stopwords
from nltk.corpus import stopwords

try:
    stop = stopwords.words('english')
except LookupError:
    # The stopword corpus is not installed on this machine yet:
    # download it once, then load it again.
    nltk.download('stopwords', quiet=True)
    stop = stopwords.words('english')

# `stop` stays a list for any later use; membership tests go through a set so
# each word is a hash lookup rather than a scan of the whole list.
stop_lookup = frozenset(stop)

data['title'] = data['title'].apply(
    lambda title: ' '.join(word for word in title.split() if word not in stop_lookup)
)

In [32]:
# Hold out 20% of the rows for testing; the seed fixes which rows land in each part
X_train, X_test, y_train, y_test = train_test_split(
    data['title'],
    data['target'],
    random_state=42,
    test_size=0.2,
)

In [33]:
# Token counts -> TF-IDF weighting -> logistic-regression classifier
from sklearn.linear_model import LogisticRegression

pipe = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LogisticRegression()),
])

# Fit on the training titles; `model` holds whatever `pipe.fit` returns
model = pipe.fit(X_train, y_train)

In [34]:
def input_func(text):
    """Classify a single headline with the fitted ``model``.

    The headline is converted to ``str``, lowercased and stripped of
    punctuation before it is handed to ``model.predict``.

    Parameters
    ----------
    text : object
        Headline to classify; it is passed through ``str()`` first.

    Returns
    -------
    str
        The label predicted for the headline.
    """
    # Clean the string itself. Routing it through DataFrame.apply would hand
    # punctuation_removal a whole column instead of the headline, so its
    # character-level filter would never see the individual characters.
    cleaned = punctuation_removal(str(text).lower())

    # predict expects a collection of documents, hence the one-element list
    prediction = model.predict([cleaned])

    # Read the label from the result rather than slicing the array's repr,
    # which only works for labels that are exactly four characters long.
    return str(prediction[0])

In [35]:
import pickle

# Persist the fitted pipeline. The context manager closes the file handle as
# soon as the dump finishes, even if pickling raises.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)