In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [2]:
nltk.download('stopwords')
en_stopwords = set(stopwords.words("english")) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Erik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
import pandas as pd
import numpy as np
df = pd.read_csv('train.csv')
print(df.head, df.shape)
df = df[0:10000]
#df.drop(np.arange(10000, 20376))
print("----------------------------------------------")
print(df.head, df.shape)


<bound method NDFrame.head of           id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2        

In [30]:

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import PassiveAggressiveClassifier
import os


#replacing Body nan with Headline
for i in range(0,df.shape[0]-1):
    if(df.text.isnull()[i]):
        df.text[i] = df.title[i]
        
y = df.label
X = df.text

#train_test separation
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

#Applying tfidf to the data set
tfidf_vect = TfidfVectorizer(stop_words = 'english')
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vect.get_feature_names())

#Applying Naive Bayes
clf = MultinomialNB() 
clf.fit(tfidf_train, y_train)                       # Fit Naive Bayes classifier according to X, y
pred = clf.predict(tfidf_test)                     # Perform classification on an array of test vectors X.
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

#Applying Passive Aggressive classifier
linear_clf = PassiveAggressiveClassifier()
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

accuracy:   0.866
[[996   6]
 [262 736]]
accuracy:   0.952
[[945  57]
 [ 39 959]]


In [3]:
def tokenize(text): 
    """
    Separate text into tokens
    """
    tknzr = TweetTokenizer()
    return tknzr.tokenize(text)

def get_vectorizer():
    """ 
    Create and return a CountVectorizer
    
    Hints:
        Read docs on CountVectorizer to set arguments (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) 
        Pass the above tokenize function as the tokenizer.
        Use en_stopwords variable above as stopwords
        
        Play with different parameters.
        min_df argument can help with generalizability
    """
    
    # YOUR CODE HERE
    vectorizer = CountVectorizer(tokenizer=tokenize, stop_words=en_stopwords, min_df=0.1)
    # vectorizer = CountVectorizer(tokenizer=tokenize, stop_words=en_stopwords)
    return vectorizer

def get_kfolds():
    """ 
    Split dataset into 5 splits using StratifiedKFold 
    
    Hint:
    Remember to shuffle
    """
    # YOUR CODE HERE
    kfolds = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
    return kfolds


vectorizer = get_vectorizer()
kfolds = get_kfolds()

In [10]:
def get_pipeline(**kwargs):
    """
        Define pipeline using make_pipeline (see sklearn docs) with vectorizer and SVM Classifier. 
        
        The SVM Classifer should take in all kwargs passed (passing kwargs can be achieved with fn(**kwargs))
        You should use balanced class weights for SVM.
    """
    
    # YOUR CODE HERE
    '''pipeline_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SVC(**kwargs))])'''
    pipeline_svm = make_pipeline(get_vectorizer(), SVC(**kwargs, class_weight='balanced'))
    #print(pipeline_svm.get_params().keys())
    #pipeline_svm = make_pipeline(get_vectorizer(), SVC(**kwargs))
    
    return pipeline_svm

def get_course_params():
    """
        Create the grid search parameters defined above for course grid search. 
        Returns a list of dictionaries to be passed as argument to GridSearchCV below
    """
    
    # YOUR CODE HERE    
    param_grid = [{'svc__kernel': ['rbf'], 'svc__gamma': [0.1, 0.5, 1.0]},
                    {'svc__kernel': ['linear'], 'svc__C': [0.01, 1.0, 10.0]},
                    {'svc__kernel': ['poly'], 'svc__degree': [2, 3], 'svc__gamma': [0.1, 0.5, 1.0]}]

    return param_grid

def get_grid_svm():
    """
        Create GridSearchCV with pipeline and the grid search parameters given above using "accuracy" for scoring.
    """ 
     
    param_grid = get_course_params()
    pipeline_svm = get_pipeline()
    
    # YOUR CODE HERE
    grid_svm = GridSearchCV(
        get_pipeline(), get_course_params(), scoring='accuracy')
    
    return grid_svm
    
    

grid_svm = get_grid_svm()
# For debugging purposes, it makes sense to use a smaller set of training set to speed up the grid search progress
# refit is not necessary since by default refit is true in GridSearchCV, 
# but we did this to show that you need to use the best parameter to fit the whole training set
_ = grid_svm.fit(train, test)

TypeError: '<' not supported between instances of 'str' and 'int'

In [5]:
best_params = None
best_score = None

# Store best parameters and CV score from grid search for reporting into the variables above
# YOUR CODE HERE
best_params = grid_svm.best_params_
best_score = grid_svm.best_score_

# Report best parameters and CV score from grid search
print(f'best params: {best_params} | best cv score: {best_score}')

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [6]:
def get_params_fine_scale():
    """
        Set param_grid to a dictionary containing parameter values for fine scale search.
        Return value is passed as argument to RandomizedSearchCV below
    """ 
    # YOUR CODE HERE
    param_grid = [{'svc__kernel': ['linear'], 'svc__C': [0.01, 0.02, 0.03, 0.04, 0.05]}]
    
    return param_grid

def get_random_svm():
    """
        Create randomized parameter search over fine scale grid;
        Do NOT change the value of n_iter in the submitted version of your notebook.
    """ 
    n_iter = 5
    pipeline_svm = get_pipeline()
    param_grid = get_params_fine_scale()

    # important
    random_svm = RandomizedSearchCV(
        pipeline_svm,
        param_grid,
        n_iter=n_iter,
        cv = kfolds,
        scoring="accuracy",
        verbose=1,   
        n_jobs=-1
    )
    return random_svm

random_svm = get_random_svm()
# refit is not necessary since by default refit is true in RandomSearchCV, 
# but we did this to show that you need to use the best parameter to fit the whole training set
_ = random_svm.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [7]:
best_params = None
best_score = None

# Store best parameters and score from random search for reporting into the variables above 
# YOUR CODE HERE
best_params = random_svm.best_params_
best_score = random_svm.best_score_

# Report best parameters and score from random search
print(f'best params: {best_params} | best cv score: {best_score}')

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [8]:
def report_results(model, X, y):
    # Reports various model metrics.  
    pred = model.predict(X)        
    acc = accuracy_score(y, pred)
    f1 = f1_score(y, pred)
    prec = precision_score(y, pred)
    rec = recall_score(y, pred)
    result = {'f1': f1, 'acc': acc, 'precision': prec, 'recall': rec}
    return result

In [None]:
# get test metrics.
report_results(random_svm.best_estimator_, X_test, y_test)