In [None]:
import nltk
nltk.download('punkt')
import pandas as pd

In [None]:
# Read the two article CSVs from the working directory; the generator keeps
# both reads symmetric without leaving a loop variable behind.
fake, true = (pd.read_csv(csv_name) for csv_name in ("Fake.csv", "True.csv"))

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
fake.info()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
true.info()

In [None]:
# Preview the first 10 rows loaded from Fake.csv.
display(fake.head(10))

In [None]:
# Preview the first 10 rows loaded from True.csv.
display(true.head(10))

In [None]:
# Rows per subject, shown for the fake frame first and then the true frame.
display(
    fake["subject"].value_counts(),
    true["subject"].value_counts(),
)

In [None]:
# Target label: 0 marks rows from the fake frame, 1 marks rows from the true frame.
fake['flag'], true['flag'] = 0, 1

In [None]:
# Sanity check: each frame should report a single flag value covering all its rows.
display(
    fake["flag"].value_counts(),
    true["flag"].value_counts(),
)

In [None]:
# Stack the fake rows on top of the true rows; each part keeps its own row labels.
data = pd.concat((fake, true), axis="index")

In [None]:
# Replace the row labels inherited from the two source frames with a fresh
# 0..n-1 index; drop=True discards the old labels instead of keeping them as a column.
data = data.reset_index(drop=True)

In [None]:
# Remove the title, subject and date columns; later cells only read 'text' and 'flag'.
data = data.drop(columns=['title', 'subject', 'date'])

In [None]:
# Rebuild the combined frame in one idempotent step. A bare re-concatenation here
# restored the duplicated row labels and the title/subject/date columns, silently
# undoing the reset_index and drop cells that ran just before it.
data = (
    pd.concat([fake, true], axis=0, ignore_index=True)
    .drop(columns=['title', 'subject', 'date'])
)

In [None]:
# Show which columns the combined frame carries into preprocessing.
print(data.columns)

# TOKENIZATION

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Replace every article string in 'text' with the token list word_tokenize returns for it.
data['text'] = data['text'].map(word_tokenize)

In [None]:
# Use the notebook's rich table rendering, matching the earlier display(...)
# previews, instead of a plain-text print of the frame.
display(data.head(10))

# STEMMING

In [None]:
from nltk.stem.snowball import SnowballStemmer
# Despite its name, `porter` holds an English Snowball stemmer; stem_it relies
# on this module-level instance.
porter = SnowballStemmer(language="english")

In [None]:
def stem_it(text):
    """Stem every token of an already-tokenized document.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list holding ``porter.stem(token)`` for each token, in the
        original order.
    """
    stemmed = []
    for token in text:
        stemmed.append(porter.stem(token))
    return stemmed

In [None]:
# Stem each token list in place of the raw tokens.
data['text'] = data['text'].map(stem_it)

In [None]:
# Use the notebook's rich table rendering, matching the earlier display(...)
# previews, instead of a plain-text print of the frame.
display(data.head(10))

# STOPWORD REMOVAL (drops tokens of 2 characters or fewer)

In [None]:
from nltk.corpus import stopwords

In [None]:
def stop_it(t, stop_words=None):
    """Filter noise tokens out of a token list.

    Args:
        t: Iterable of string tokens.
        stop_words: Optional iterable of tokens to discard in addition to the
            length filter. Matching is exact, so the entries must be in the
            same form as the tokens in ``t``. Defaults to ``None``, which
            keeps the original behaviour of filtering on length only.

    Returns:
        A new list, in the original order, of the tokens longer than two
        characters that are not listed in ``stop_words``.
    """
    if stop_words is None:
        return [word for word in t if len(word) > 2]
    # Freeze once so membership checks stay O(1) for any iterable passed in.
    blocked = frozenset(stop_words)
    return [word for word in t if len(word) > 2 and word not in blocked]

In [None]:
# Run every token list through stop_it, keeping only the tokens it returns.
data['text'] = data['text'].map(stop_it)

In [None]:
# Use the notebook's rich table rendering, matching the earlier display(...)
# previews, instead of a plain-text print of the frame.
display(data.head(10))

In [None]:
# Collapse each token list back into one space-separated string.
data['text'] = data['text'].map(' '.join)

In [None]:
# Use the notebook's rich table rendering, matching the earlier display(...)
# previews, instead of a plain-text print of the frame.
display(data.head(10))

# SPLITTING

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation. random_state fixes the shuffle so a
# Restart & Run All reproduces the same split (and therefore comparable scores).
x_train, x_test, y_train, y_test = train_test_split(
    data['text'], data['flag'], test_size=0.25, random_state=42
)

display(x_train.head())
print("\n")
display(y_train.head())

# VECTORIZATION

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Per scikit-learn's documentation, a float max_df is a document-frequency
# proportion: terms occurring in more than 70% of the documents are ignored.
my_tfidf = TfidfVectorizer(max_df=0.7)

# Fit the vectorizer on the training texts only, then reuse that fitted
# vectorizer to transform the held-out texts.
tfidf_train = my_tfidf.fit_transform(x_train)
tfidf_test = my_tfidf.transform(x_test)

In [None]:
# Quick look at the TF-IDF matrix produced for the training texts.
print(tfidf_train)

# LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Fit logistic regression (up to 900 solver iterations) on the training TF-IDF
# features, then score its predictions on the held-out split.
model_1 = LogisticRegression(max_iter=900).fit(tfidf_train, y_train)
predict_1 = model_1.predict(tfidf_test)
accur_pc = accuracy_score(y_true=y_test, y_pred=predict_1)

In [None]:
# accur_pc holds the accuracy_score result; scale it by 100 to show a percentage.
print(100 * accur_pc)

# PASSIVE-AGGRESSIVE CLASSIFIER

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# random_state pins the classifier's internal data shuffling so that, for a
# given train/test split, refitting yields the same model and accuracy.
model = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model.fit(tfidf_train, y_train)

In [None]:
# Score the passive-aggressive model on the held-out split and print the
# accuracy as a percentage.
y_predict = model.predict(tfidf_test)
accscore = accuracy_score(y_true=y_test, y_pred=y_predict)
print(100 * accscore)

In [None]:
# Push the typed article through the same steps applied to the training text:
# tokenize -> stem -> drop short tokens.
user_input = stop_it(stem_it(word_tokenize(input("Enter a news article : "))))

In [None]:
# Re-join the filtered tokens into one space-separated string, the same form
# the 'text' column had when the vectorizer was fitted.
user_input = ' '.join(user_input)

In [None]:
# Vectorize the user input with the already-fitted TF-IDF vectorizer
# (wrapped in a list because transform expects a collection of documents).
user_feature = my_tfidf.transform([user_input])

# predict_proba returns class probabilities rather than a label: one row for
# the single input, with one column per class in model_1.classes_ order.
predicted_category_1 = model_1.predict_proba(user_feature)

In [None]:
# predicted_category_1 is predict_proba output: one row of class probabilities
# ordered like model_1.classes_. Report the most likely label (flag 0 = fake,
# flag 1 = true) together with the probabilities, instead of printing the raw
# matrix under a "Predicted category" heading.
probabilities = predicted_category_1[0]
predicted_flag = model_1.classes_[probabilities.argmax()]
print(
    "Predicted category using Logistic Regression:",
    "true" if predicted_flag == 1 else "fake",
    dict(zip(model_1.classes_.tolist(), probabilities.round(4).tolist())),
)