In [46]:
import pandas as pd
import cufflinks as cf
from IPython.core.display import HTML
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from collections import OrderedDict

In [47]:
# Read the articles CSV (first column becomes the index) and discard every
# row that has a missing value in any column.
df = pd.read_csv(r'news_articles.csv', encoding="latin", index_col=0).dropna()

In [48]:
# Put cufflinks into offline mode, then apply the config-file settings below.
# NOTE(review): offline=False looks at odds with the go_offline() call, and
# world_readable=True may not be wanted -- confirm both are intentional.
cf.go_offline()
cufflinks_settings = {'offline': False, 'world_readable': True}
cf.set_config_file(**cufflinks_settings)

In [49]:
def convert(path):
    """Build an 80px-wide HTML ``<img>`` tag for an image path or URL.

    This is passed as a ``DataFrame.to_html`` formatter together with
    ``escape=False``, so the returned markup is rendered verbatim. The value
    comes straight from the CSV, therefore it is HTML-escaped (quotes
    included) before being placed inside the ``src`` attribute; otherwise a
    stray ``"`` or ``&`` in the data would break or inject markup.

    Args:
        path: Image location; converted with ``str()`` before escaping.

    Returns:
        str: ``<img src="..." width="80">`` with the escaped path.
    """
    # Function-scope import so the cell works without touching the import cell.
    from html import escape

    return '<img src="' + escape(str(path), quote=True) + '" width="80">'
# Narrow the frame to the three columns shown in the source preview.
df_sources = df[['site_url', 'label', 'main_img_url']]

# Row masks for each label value (computed on df, applied to df_sources).
is_real = df['label'] == 'Real'
is_fake = df['label'] == 'Fake'

# Positions 6..9 of the 'Real' rows, and the first six 'Fake' rows.
df_r = df_sources.loc[is_real].iloc[6:10, :]
df_f = df_sources.loc[is_fake].head(6)

# Show the 'Real' sample; escape=False lets convert() emit a live <img> tag.
HTML(df_r.to_html(escape=False, formatters={'main_img_url': convert}))

Unnamed: 0_level_0,site_url,label,main_img_url
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fed Up,100percentfedup.com,Real,
Fed Up,100percentfedup.com,Real,
Fed Up,100percentfedup.com,Real,
Fed Up,100percentfedup.com,Real,


In [50]:
# Rebuild the preview frames so this cell can run on its own.
preview_columns = ['site_url', 'label', 'main_img_url']
df_sources = df[preview_columns]

# Positions 6..9 of the 'Real' rows.
real_rows = df_sources.loc[df['label'] == 'Real']
df_r = real_rows.iloc[6:10, :]

# First six 'Fake' rows.
fake_rows = df_sources.loc[df['label'] == 'Fake']
df_f = fake_rows.head(6)

# Show the 'Fake' sample; escape=False lets convert() emit a live <img> tag.
HTML(df_f.to_html(escape=False, formatters={'main_img_url': convert}))

Unnamed: 0_level_0,site_url,label,main_img_url
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No Author,21stcenturywire.com,Fake,
No Author,21stcenturywire.com,Fake,
Shawn Helton,21stcenturywire.com,Fake,
Mike Rivero,21stcenturywire.com,Fake,
No Author,21stcenturywire.com,Fake,
Shawn Helton,21stcenturywire.com,Fake,


In [51]:
# Integer codes for the two label values and for the eight article types.
type_label = {'Real': 0, 'Fake': 1}
type_mapping = {
    'bias': 0,
    'conspiracy': 1,
    'fake': 2,
    'bs': 3,
    'satire': 4,
    'hate': 5,
    'junksci': 6,
    'state': 7,
}

# Overwrite each string column with its integer code. Series.map produces NaN
# for any value missing from the dict.
# NOTE(review): this cell is not idempotent -- running it a second time maps
# the already-encoded integers and turns both columns into NaN.
for column, codes in (('label', type_label), ('type', type_mapping)):
    df[column] = df[column].map(codes)

In [52]:
def plot_bar(df, feat_x, feat_y, normalize=True):
    """Plot a stacked bar chart of ``feat_y`` counts per ``feat_x`` value.

    Args:
        df: DataFrame holding both columns.
        feat_x: Column whose values become the bars (cross-tab rows).
        feat_y: Column whose values become the stacked segments.
        normalize: When true, each row of counts is divided by its row total
            so every bar shows shares instead of raw counts.

    Returns:
        Whatever ``DataFrame.plot(kind='bar', stacked=True)`` returns.
    """
    counts = pd.crosstab(df[feat_x], df[feat_y])
    if not normalize:
        return counts.plot(kind='bar', stacked=True)

    # Scale each row by its own total.
    row_totals = counts.sum(axis=1)
    shares = counts.div(row_totals, axis=0)
    return shares.plot(kind='bar', stacked=True)


In [53]:
# Two-column view: source site and (encoded) article type.
df_type = df[['site_url', 'type']]

# Shuffle all rows. The training cell reads `df1`, so this assignment must
# actually run (it was commented out, which raised NameError on a fresh
# kernel). A fixed seed keeps the shuffle reproducible across runs.
df1 = df.sample(frac=1, random_state=1)


In [55]:
# Feature: the stop-word-stripped article text.
x = df1['text_without_stopwords']
# Target: the binary `label` column (0 = Real, 1 = Fake per `type_label`).
# The model was previously fitted on the 8-class `type` column, while
# classify_news_headline decodes predictions as 0 -> 'Real', 1 -> 'Fake';
# that reported type 0 ('bias') as "Real" and type 1 ('conspiracy') as "Fake".
y = df1['label']


# Distinct target values and a display name for each of them.
unique_types = y.unique()
# print("Unique values in target column:", unique_types)

class_mapping = {value: f'Type_{value}' for value in unique_types}

# Display the updated mapping dictionary
# print("Updated Mapping Dictionary:", class_mapping)

# Split the data into training and testing sets (70/30). The seed makes the
# split, and therefore the fitted model, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=1
)

# Create the TF-IDF vectorizer and fit it on the training text only.
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf_train = tfidf_vect.fit_transform(x_train)

# Train the classifier: AdaBoost over five depth-10 decision trees.
Adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=5, random_state=1)
Adab.fit(tfidf_train, y_train)



def classify_news_headline(headline, tfidf_vectorizer, adaboost_classifier, class_mapping=None):
    """Classify one headline and return a human-readable class name.

    Args:
        headline: Headline text; runs of whitespace are collapsed to single
            spaces before vectorizing.
        tfidf_vectorizer: Fitted object whose ``transform`` accepts a list of
            strings.
        adaboost_classifier: Fitted object whose ``predict`` returns an
            indexable sequence of class codes.
        class_mapping: Optional dict from class code to display name. It must
            match the codes the classifier was trained on. Defaults to
            ``{0: 'Real', 1: 'Fake'}``.

    Returns:
        str: The mapped name of the predicted class, or ``'Unknown'`` when the
        predicted code is not a key of ``class_mapping``.
    """
    if class_mapping is None:
        # Backward-compatible default: binary Real/Fake codes.
        class_mapping = {0: 'Real', 1: 'Fake'}

    headline_processed = ' '.join(headline.split())
    headline_tfidf = tfidf_vectorizer.transform([headline_processed])
    # predict() returns one code per input row; only one row was passed in.
    prediction = adaboost_classifier.predict(headline_tfidf)[0]
    return class_mapping.get(prediction, 'Unknown')

# Example usage: run the trained model on a sample headline from the dataset.
headline_to_classify = "pin drop speech by father of daughter kidnapped and killed by isis i have voted for donald j trump  percentfedupcom"
classification_result = classify_news_headline(
    headline=headline_to_classify,
    tfidf_vectorizer=tfidf_vect,
    adaboost_classifier=Adab,
)
print(f"The headline '{headline_to_classify}' is classified as: {classification_result}")

The headline 'pin drop speech by father of daughter kidnapped and killed by isis i have voted for donald j trump  percentfedupcom' is classified as: Real
