In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import numpy as np

In [14]:
# Defining feature extraction and preprocessing steps
text_feature = 'text'

# Splitting data into train and test sets
train=pd.read_csv('../Datasets/fake-news/train.csv')


In [24]:
train=train[~train['text'].isna()]

In [25]:
snowballStemmer = SnowballStemmer("english")
stop= stopwords.words("english")

def tokenize(text):
    return text.split()
def tokenize_snowball(text):
    return [snowballStemmer.stem(word) for word in text.split()]

In [26]:
pipeline = Pipeline([
    ("vect", TfidfVectorizer()),
    ("clf", RandomForestClassifier())
])

print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train['text'],train['label'])
print('fitted')



fitting
fitted


In [27]:
test=pd.read_csv('../Datasets/fake-news/test.csv')
test=pd.merge(test,pd.read_csv('../Datasets/fake-news/submit.csv'), left_on='id', right_on='id')
test=test[~test['text'].isna()]
test['pred_NB1']= pipeline.predict(test['text'])
print(accuracy_score(test['label'], test['pred_NB1']))

0.665896398998652


# Adding all features

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [29]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [30]:
# Defining feature extraction and preprocessing steps
text_feature = 'text'
categorical_features = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise','word_count', 'stopword_count', 'prop_stop', 'caps_rel', 'ratio_long_words']

train=pd.read_csv('../Datasets/fake-news/train.csv')
train=train[(~train['text'].isna())|(~train['title'].isna())]
# Splitting data into train and test sets
train=pd.merge(train,pd.read_csv('../Datasets/fake-news/scores_emotion.csv'), right_on='id', left_on='id')
train=pd.merge(train,pd.read_csv('../Datasets/fake-news/stop_words.csv'), right_on='id', left_on='id')
train=pd.merge(train,pd.read_csv('../datasets/fake-news/long_train.csv'), right_on='id', left_on='id')
train=pd.merge(train,pd.read_csv('../datasets/fake-news/caps_train.csv'), right_on='id', left_on='id')

test=pd.read_csv('../Datasets/fake-news/test.csv')
test=test[~test['text'].isna()|~test['title'].isna()]
test=pd.merge(test,pd.read_csv('../Datasets/fake-news/scores_emotion_test.csv'), right_on='id', left_on='id')
test=pd.merge(test,pd.read_csv('../Datasets/fake-news/stop_words_test.csv'), right_on='id', left_on='id')
test=pd.merge(test,pd.read_csv('../datasets/fake-news/long_test.csv'), right_on='id', left_on='id')
test=pd.merge(test,pd.read_csv('../datasets/fake-news/caps_test.csv'), right_on='id', left_on='id')
train = train.drop('author', axis=1)
test = test.drop('author', axis=1)
train=train[train['word_count']>0]
test=test[test['word_count']>0]
test=pd.merge(test,pd.read_csv('../Datasets/fake-news/submit.csv'), left_on='id', right_on='id')


In [31]:
train['caps_rel']=train['Number of Caps']/train['word_count']
test['caps_rel']=test['Number of Caps']/test['word_count']

In [33]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer())
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
rf = RandomForestClassifier()

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))


fitting
fitted
0.6527777777777778


# Emotion

In [34]:
categorical_features = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [35]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer())
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
rf = RandomForestClassifier()

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))


fitting
fitted
0.6513888888888889


#  Word count

In [36]:
categorical_features = ['word_count']

In [37]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer())
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
rf = RandomForestClassifier()

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))


fitting
fitted
0.6545634920634921


# Stop Words

In [38]:
categorical_features = ['stopword_count', 'prop_stop']


In [39]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer())
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
rf = RandomForestClassifier()

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))


fitting
fitted
0.653968253968254


# Long Words

In [40]:
categorical_features = [ 'ratio_long_words']


In [41]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer())
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
rf = RandomForestClassifier()

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))


fitting
fitted
0.6545634920634921


# All Emotions

In [43]:
categorical_features = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise','word_count', 'stopword_count', 'prop_stop', 'caps_rel', 'ratio_long_words']


In [44]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 1), stop_words=stop))
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
rf = RandomForestClassifier()

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', rf)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))


fitting
fitted
0.6460317460317461
