In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords


In [2]:
import numpy as np

In [3]:
# Defining feature extraction and preprocessing steps
text_feature = 'text'

# Splitting data into train and test sets
train=pd.read_csv('../Datasets/fake-news/train.csv')
test=pd.read_csv('../Datasets/fake-news/test.csv')

In [4]:
train=train[~train['text'].isna()]

In [5]:
snowballStemmer = SnowballStemmer("english")
stop= stopwords.words("english")

def tokenize(text):
    return text.split()
def tokenize_snowball(text):
    return [snowballStemmer.stem(word) for word in text.split()]

In [6]:
pipeline = Pipeline([
    ("vect", TfidfVectorizer()),
    ("clf", LogisticRegression())
])

params = [{
    "vect__ngram_range": [(1,1), (1,2)],
    "vect__stop_words": [None, stop],
    "vect__tokenizer": [tokenize, tokenize_snowball],
    "clf__penalty": ['l1','l2']
}]

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=4
)

grid.fit(train['text'], train['label'])
model1 = grid.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\guill\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\guill\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\guill\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\guill\AppData\Roaming\Py

In [7]:
test=pd.merge(test,pd.read_csv('../Datasets/fake-news/submit.csv'), left_on='id', right_on='id')
test=test[~test['text'].isna()]
test['pred_NB1']= model1.predict(test['text'])
print(accuracy_score(test['label'], test['pred_NB1']))

0.642788368958213


In [8]:
test.to_csv('../Datasets/fake-news/pred_log.csv')

In [9]:
best_params = grid.best_params_

# Print the best parameters
print("Best Parameters:", best_params)

Best Parameters: {'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'm

In [10]:
from joblib import dump
dump(model1, '../models/log.joblib')


['../models/log.joblib']

In [None]:
3+2

# Adding all features

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [26]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [37]:
# Defining feature extraction and preprocessing steps
text_feature = 'text'
categorical_features = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise','word_count', 'stopword_count', 'prop_stop', 'caps_rel', 'ratio_long_words']

train=pd.read_csv('../Datasets/fake-news/train.csv')
train=train[(~train['text'].isna())|(~train['title'].isna())]
# Splitting data into train and test sets
train=pd.merge(train,pd.read_csv('../Datasets/fake-news/scores_emotion.csv'), right_on='id', left_on='id')
train=pd.merge(train,pd.read_csv('../Datasets/fake-news/stop_words.csv'), right_on='id', left_on='id')
train=pd.merge(train,pd.read_csv('../datasets/fake-news/long_train.csv'), right_on='id', left_on='id')
train=pd.merge(train,pd.read_csv('../datasets/fake-news/caps_train.csv'), right_on='id', left_on='id')

test=pd.read_csv('../Datasets/fake-news/test.csv')
test=test[~test['text'].isna()|~test['title'].isna()]
test=pd.merge(test,pd.read_csv('../Datasets/fake-news/scores_emotion_test.csv'), right_on='id', left_on='id')
test=pd.merge(test,pd.read_csv('../Datasets/fake-news/stop_words_test.csv'), right_on='id', left_on='id')
test=pd.merge(test,pd.read_csv('../datasets/fake-news/long_test.csv'), right_on='id', left_on='id')
test=pd.merge(test,pd.read_csv('../datasets/fake-news/caps_test.csv'), right_on='id', left_on='id')
train = train.drop('author', axis=1)
test = test.drop('author', axis=1)
train=train[train['word_count']>0]
test=test[test['word_count']>0]
test=pd.merge(test,pd.read_csv('../Datasets/fake-news/submit.csv'), left_on='id', right_on='id')


In [38]:
train['caps_rel']=train['Number of Caps']/train['word_count']
test['caps_rel']=test['Number of Caps']/test['word_count']

In [39]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 1), stop_words=stop))
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
logistic = LogisticRegression(penalty='l2')

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('naive_bayes', logistic)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))


fitting


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fitted
0.6301587301587301


In [40]:
dump(pipeline, '../models/log_2.joblib')
test.to_csv('../Datasets/fake-news/log2.csv')

# Emotion

In [34]:
categorical_features = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [35]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 1), stop_words=stop))
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
logistic = LogisticRegression(penalty='l2')

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('naive_bayes', logistic)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))


fitting


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fitted
0.6311507936507936


#  Word count

In [41]:
categorical_features = ['word_count']

In [42]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 1), stop_words=stop))
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
logistic = LogisticRegression(penalty='l2')

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('naive_bayes', logistic)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))

fitting


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fitted
0.6311507936507936


# Stop Words

In [43]:
categorical_features = ['stopword_count', 'prop_stop']


In [44]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 1), stop_words=stop))
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
logistic = LogisticRegression(penalty='l2')

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('naive_bayes', logistic)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))

fitting


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fitted
0.6311507936507936


# Long Words

In [45]:
categorical_features = [ 'ratio_long_words']


In [46]:
# Build a ColumnTransformer with FeatureUnion
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 1), stop_words=stop))
])

num_pipeline = Pipeline([
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='kmeans'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, categorical_features),
    ('text', text_pipeline, text_feature)
])

# Define the Naive Bayes classifier
naive_bayes = MultinomialNB(alpha=0.01)

# Creating the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('naive_bayes', naive_bayes)
])
print('fitting')
# Fitting the pipeline on the training data
pipeline.fit(train,train['label'])
print('fitted')

# Predicting on the test data
test['pred'] = pipeline.predict(test)

# Evaluating the performance
print(accuracy_score(test['label'], test['pred']))


fitting
fitted
0.6111111111111112
