In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import re
import string
from langdetect import detect

In [2]:
df_fake = pd.read_csv(r'C:\Users\alfre\Downloads\Fake.csv.csv')
df_true = pd.read_csv(r'C:\Users\alfre\Downloads\True.csv.csv')


In [3]:
df_fake["class"] = 0
df_true["class"] = 1

In [17]:
df_fake_manual_testing = df_fake.head(10)

In [5]:
df_true_manual_testing = df_true.tail(10)

In [6]:
df_fake = df_fake.iloc[:-10]

In [7]:
df_true = df_true.iloc[:-10]

In [8]:
data_merge = pd.concat([df_fake, df_true], axis=0)

In [9]:
data_merge = data_merge.sample(frac=1).reset_index(drop=True)

In [10]:
data = data_merge[["text", "class"]]

In [11]:
def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

In [13]:
def wordopt_multilang(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    lang = detect_language(text)
    if lang == 'ta':  # Tamil
        text = re.sub(r'[^\u0000-\u007F\u0B80-\u0BFF]+', '', text)
    elif lang == 'hi':  # Hindi
        text = re.sub(r'[^\u0000-\u007F\u0900-\u097F]+', '', text)
    
    return text

  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [14]:
data["text"] = data["text"].apply(wordopt_multilang)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["text"] = data["text"].apply(wordopt_multilang)


In [15]:
vectorization = TfidfVectorizer()
x = vectorization.fit_transform(data["text"])


In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, data["class"], test_size=0.2, random_state=42)


In [18]:
LR = LogisticRegression()
LR.fit(x_train, y_train)


In [21]:
DT = DecisionTreeClassifier()
DT.fit(x_train, y_train)

In [20]:
RF = RandomForestClassifier()
RF.fit(x_train, y_train)

In [22]:
import pickle

In [23]:
with open(r'C:\Users\alfre\OneDrive\Desktop\lang-jupi\lang-jupi\model_LR.pkl', 'wb') as file:
    pickle.dump(LR, file)

In [24]:
with open(r'C:\Users\alfre\OneDrive\Desktop\lang-jupi\lang-jupi\model_DT.pkl', 'wb') as file:
    pickle.dump(DT, file)

In [25]:
with open(r'C:\Users\alfre\OneDrive\Desktop\lang-jupi\lang-jupi\model_RF.pkl', 'wb') as file:
    pickle.dump(RF, file)

In [26]:
with open(r'C:\Users\alfre\OneDrive\Desktop\lang-jupi\lang-jupi\vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorization, file)

In [27]:
pred_LR = LR.predict(x_test)

In [28]:
pred_DT = DT.predict(x_test)

In [29]:
pred_RF = RF.predict(x_test)

In [30]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_LR))


Logistic Regression Accuracy: 0.9867424242424242


In [31]:
print("Decision Tree Accuracy:", accuracy_score(y_test, pred_DT))

Decision Tree Accuracy: 0.9967691622103387


In [32]:
print("Random Forest Accuracy:", accuracy_score(y_test, pred_RF))

Random Forest Accuracy: 0.9888591800356507


In [33]:

print("\nLogistic Regression Report:\n", classification_report(y_test, pred_LR))


Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4670
           1       0.99      0.99      0.99      4306

    accuracy                           0.99      8976
   macro avg       0.99      0.99      0.99      8976
weighted avg       0.99      0.99      0.99      8976



In [34]:
print("\nDecision Tree Report:\n", classification_report(y_test, pred_DT))


Decision Tree Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4670
           1       1.00      1.00      1.00      4306

    accuracy                           1.00      8976
   macro avg       1.00      1.00      1.00      8976
weighted avg       1.00      1.00      1.00      8976



In [35]:
print("\nRandom Forest Report:\n", classification_report(y_test, pred_RF))


Random Forest Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4670
           1       0.99      0.99      0.99      4306

    accuracy                           0.99      8976
   macro avg       0.99      0.99      0.99      8976
weighted avg       0.99      0.99      0.99      8976

