In [7]:
# Load the dataset
import pandas as pd

# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [8]:
# Count the missing values in each column.
df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
# Keep only rows that have body text, merge title and text into a single
# `all_text` column, and discard the source columns. Rows whose title is
# missing end up with a missing `all_text` as well (NaN + str is NaN).
df = (
    df.dropna(subset=['text'])
      .assign(all_text=lambda frame: frame['title'] + ' ' + frame['text'])
      .drop(columns=['id', 'author', 'title', 'text'])
)
df.isna().sum()

label         0
all_text    558
dtype: int64

In [10]:
# Discard rows whose combined text is missing, then re-check the null counts.
df = df[df['all_text'].notna()]
df.isna().sum()

label       0
all_text    0
dtype: int64

In [11]:
# Tally how many rows carry each label value.
df.loc[:, 'label'].value_counts()

0    10387
1     9816
Name: label, dtype: int64

In [12]:
# Preprocess the data
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Lazily-built defaults shared across calls, so the NLTK stopword list is read
# and the stemmer constructed once instead of once per document.
_PREPROCESS_DEFAULTS = {}


def preprocess_text(text, stop_words=None, stemmer=None):
    """Normalize a raw document into a space-joined string of stemmed tokens.

    Processing order: keep only alphabetic and whitespace characters,
    lowercase, split on whitespace, drop stopwords, stem what remains.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Lowercase tokens to discard. Defaults to NLTK's English stopword list,
        loaded on first use and then reused.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to each kept token. Defaults to a shared
        ``PorterStemmer`` instance created on first use.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' if nothing remains).
    """
    if stop_words is None:
        if 'stop_words' not in _PREPROCESS_DEFAULTS:
            _PREPROCESS_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _PREPROCESS_DEFAULTS['stop_words']
    if stemmer is None:
        if 'stemmer' not in _PREPROCESS_DEFAULTS:
            _PREPROCESS_DEFAULTS['stemmer'] = PorterStemmer()
        stemmer = _PREPROCESS_DEFAULTS['stemmer']

    # Keeping only letters and whitespace also strips punctuation and digits,
    # so no separate punctuation pass is needed. Apostrophes are removed before
    # the stopword lookup, so a contraction is matched in its stripped form.
    letters_only = ''.join(c for c in text if c.isalpha() or c.isspace())

    # Lowercase after filtering (same order as before), then whitespace-tokenize.
    tokens = letters_only.lower().split()

    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Separate the target from the text feature, then show a sample of the text
# before and after cleaning.
y = df['label']
X = df['all_text']
print('Before preprocessing:')
print(X.head())

X = X.map(preprocess_text)
print('After preprocessing:')
print(X.head())

Before preprocessing:
0    House Dem Aide: We Didn’t Even See Comey’s Let...
1    FLYNN: Hillary Clinton, Big Woman on Campus - ...
2    Why the Truth Might Get You Fired Why the Trut...
3    15 Civilians Killed In Single US Airstrike Hav...
4    Iranian woman jailed for fictional unpublished...
Name: all_text, dtype: object
After preprocessing:
0    hous dem aid didnt even see comey letter jason...
1    flynn hillari clinton big woman campu breitbar...
2    truth might get fire truth might get fire octo...
3    civilian kill singl us airstrik identifi video...
4    iranian woman jail fiction unpublish stori wom...
Name: all_text, dtype: object


In [13]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Create feature vectors using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
# NOTE(review): X_train / X_test are rebound from text to the vectorized
# matrices here, so this cell presumably cannot be re-run on its own without
# re-running the split cell first.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
# Prints the sparse training matrix as "(row, column)  weight" entries.
print(X_train)

  (0, 55387)	0.07173242381361301
  (0, 110165)	0.02699437729528575
  (0, 37604)	0.02249940634569755
  (0, 106139)	0.04211473998353424
  (0, 13877)	0.030489427496529307
  (0, 21863)	0.051460707432091565
  (0, 11683)	0.06930575625257465
  (0, 54702)	0.03196610940834586
  (0, 21077)	0.04162615888040291
  (0, 60375)	0.09875557695308286
  (0, 85240)	0.04521948827461712
  (0, 60209)	0.03330387632948736
  (0, 106772)	0.023161878629124222
  (0, 35209)	0.032701704408892134
  (0, 89036)	0.040815366752584556
  (0, 23214)	0.03564488824094397
  (0, 52384)	0.030037324283202847
  (0, 112583)	0.020104921246336477
  (0, 62254)	0.025899067970918507
  (0, 106940)	0.02850465330092847
  (0, 43200)	0.03586365346946216
  (0, 13079)	0.06086397061750317
  (0, 117828)	0.02740491416162256
  (0, 16157)	0.03215015448499058
  (0, 20707)	0.03508215678263701
  :	:
  (16161, 95794)	0.03999302533780579
  (16161, 51452)	0.05517254394847048
  (16161, 84916)	0.037328140751521625
  (16161, 642)	0.02690648738008708
  (16161

In [15]:
# Print the vectorized test set as "(row, column)  weight" entries.
print(X_test)

  (0, 119829)	0.02627291248951931
  (0, 118403)	0.025815869517282642
  (0, 118254)	0.28181362974230584
  (0, 118213)	0.015434901440427855
  (0, 117828)	0.02042838850107635
  (0, 116952)	0.044256212340370554
  (0, 116498)	0.016941422454450454
  (0, 116319)	0.01757079526693867
  (0, 116206)	0.028625458599034287
  (0, 116107)	0.09661170675656593
  (0, 115961)	0.03793188292184167
  (0, 115693)	0.11296839639653106
  (0, 113275)	0.11030360371256909
  (0, 112760)	0.05686472841346985
  (0, 112583)	0.029973539751411413
  (0, 112479)	0.03121783535696492
  (0, 112211)	0.05144910620149095
  (0, 110186)	0.0305218093881764
  (0, 109942)	0.03895073606858267
  (0, 109535)	0.03088641559364885
  (0, 108873)	0.46562745122257093
  (0, 107845)	0.05494933601013468
  (0, 107811)	0.025127030027319506
  (0, 107569)	0.07723118327715343
  (0, 107538)	0.07237361583192545
  :	:
  (4040, 6975)	0.01870452375035868
  (4040, 6386)	0.049176994007298476
  (4040, 6323)	0.020156096339345116
  (4040, 6255)	0.01619823719127

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the TF-IDF training features.
lr = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and report accuracy.
y_pred_lr = lr.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred_lr))

0.9532293986636972


In [18]:
# Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
# Fit a default multinomial naive Bayes model on the training features.
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Predict the held-out rows and report accuracy.
y_pred_nb = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(nb_accuracy)

0.8250433061123484


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier
# Fit a seeded 100-tree random forest on the training features.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows and report accuracy.
y_pred_rf = rf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred_rf))

0.9267508042563722


In [20]:
# Train an SVM classifier
from sklearn.svm import SVC

# Linear-kernel SVM. `gamma` is not passed: it is a kernel coefficient for the
# rbf/poly/sigmoid kernels only, so setting it alongside kernel='linear' had no
# effect and suggested tuning that was not happening.
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)

# Predict the held-out rows and report accuracy.
y_pred_svm = svm.predict(X_test)

print(accuracy_score(y_test, y_pred_svm))

0.9673348181143281


In [21]:
# Train the Gradient Boosting classifier
from sklearn.ensemble import GradientBoostingClassifier

# Fit a seeded gradient-boosting model (100 stages, depth-5 trees).
gb = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows and report accuracy.
y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(gb_accuracy)

0.9638703291264539


In [22]:
# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the training features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and report accuracy.
y_pred_knn = knn.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred_knn))

0.8604305864884929


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Single-step pipeline: the 'clf' slot is a placeholder that each sub-grid
# below fills with a concrete estimator.
pipeline = Pipeline([
    ('clf', None)
])

# One sub-grid per model family; GridSearchCV evaluates every combination
# within each sub-grid. The stochastic ensembles are seeded (42, matching the
# standalone random-forest and gradient-boosting cells) so that repeated runs
# of the search give the same scores.
parameters = [
    {
        'clf': [LogisticRegression(max_iter=3000)],
        'clf__C': [0.1, 1, 10],
        'clf__solver': ['lbfgs', 'liblinear'],
    },
    {
        'clf': [MultinomialNB()],
        'clf__alpha': [0.1, 1, 10],
    },
    {
        'clf': [RandomForestClassifier(random_state=42)],
        'clf__n_estimators': [10, 100, 500],
    },
    {
        'clf': [SVC()],
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf'],
    },
    {
        'clf': [GradientBoostingClassifier(random_state=42)],
        'clf__n_estimators': [100, 200],
        'clf__learning_rate': [0.01, 0.1, 1],
    },
    {
        'clf': [KNeighborsClassifier()],
        'clf__n_neighbors': [3, 5, 10],
    }
]

# 5-fold cross-validated search scored by accuracy.
# NOTE(review): X_train was already TF-IDF-transformed with a vectorizer fitted
# on the whole training split, so each fold's validation rows contributed to
# the IDF statistics; the reported CV score may be slightly optimistic.
grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='accuracy', verbose=1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best model's parameters and its mean cross-validated score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters:  {'clf': SVC(C=1, kernel='linear'), 'clf__C': 1, 'clf__kernel': 'linear'}
Best score:  0.9630613138641424
