In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split #split data into train and test sets
from sklearn.feature_extraction.text import CountVectorizer #convert text comment into a numeric vector
from sklearn.feature_extraction.text import TfidfTransformer #use TF IDF transformer to change text vector created by count vectorizer
from sklearn.svm import SVC# Support Vector Machine
from sklearn.pipeline import Pipeline #pipeline to implement steps in series
from gensim import parsing # To stem data

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory


#Read csv into a dataframe
df = pd.read_csv("./train.csv")
#print first 5 rows of dataset
print(df.head())

# Any results you write to the current directory are saved as output.

   Unnamed: 0                                              title  \
0           0  Obama’s Delusion Continues In Vapid Address To...   
1           1  Huge blast in China's Ningbo city kills at lea...   
2           2  Trump applauds senators for new healthcare ref...   
3           3  UAW chief says Clinton told him she would rene...   
4           4  The Las Vegas Mass Shooting – More to the Stor...   

                                                text       subject  \
0  Obama addressed the Nation In a nothing burger...      politics   
1  BEIJING (Reuters) - A powerful explosion in a ...     worldnews   
2  WASHINGTON (Reuters) - President Donald Trump ...  politicsNews   
3  (Reuters) - United Auto Workers President Denn...  politicsNews   
4  Shawn Helton 21st Century WireAlthough many ar...       US_News   

                  date  label  
0          Dec 6, 2015      0  
1   November 26, 2017       1  
2  September 13, 2017       1  
3       July 26, 2016       1  
4     Octo



In [2]:

from sklearn.metrics import classification_report, confusion_matrix

#for grouping similar words such as 'trying" and "try" are same words
def parse(s):
    parsing.stem_text(s)
    return s

#applying parsing to comments.
for i in range(0,len(df)):
    df.iloc[i,2]=parse(df.iloc[i,2])
    
#Seperate data into feature and results
X, y = df['text'].tolist(), df['label'].tolist()

#Split data in train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)



#Use pipeline to carry out steps in sequence with a single object
#SVM's rbf kernel gives highest accuracy in this classification problem.
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SVC(kernel='rbf'))])

#train model
text_clf.fit(X_train, y_train)

ytest = np.array(y_test)

# #predict class form test data 
# predicted = text_clf.predict(X_test)


# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(ytest, text_clf.predict(X_test)))
print(confusion_matrix(ytest, text_clf.predict(X_test)))



              precision    recall  f1-score   support

           0       1.00      0.33      0.50        12
           1       0.62      1.00      0.76        13

    accuracy                           0.68        25
   macro avg       0.81      0.67      0.63        25
weighted avg       0.80      0.68      0.64        25

[[ 4  8]
 [ 0 13]]


# 2 - Word Embeddings

In [1]:
import sys

# !pip install tensorflow

from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# !pip install tensorflow_hub
# !pip install tensorflow_text

import numpy as np
import tensorflow_hub as hub
import tensorflow_text

In [3]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

In [4]:
import re
import pandas as pd

In [5]:
import string

# !pip install preprocessor
# !pip install tweet-preprocessor
# !pip install spacy

import preprocessor as p
from spacy.lang.en import stop_words as spacy_stopwords  # we use spacy's list of stop words to clean our data

# p.set_options(p.OPT.URL, p.OPT.MENTION)  # removes mentions and URLs only
stop_words = spacy_stopwords.STOP_WORDS
punctuations = string.punctuation


def clean(text):
    text = p.clean(text)
    text = re.sub(r'\W+', ' ', text)  # remove non-alphanumeric characters
    # replace numbers with the word 'number'
    text = re.sub(r"\d+", "number", text)
    # don't consider sentenced with less than 3 words (i.e. assumed noise)
    if len(text.strip().split()) < 3:
        return None
    text = text.lower()  # lower case everything
    
    return text.strip() # remove redundant spaces

In [6]:
df = pd.read_csv("train.csv")

In [7]:
msg_train, msg_test, y_train, y_test = train_test_split(df.text, df.label)

In [None]:
X_test = embed(msg_test)
X_test.shape

In [54]:
splits = np.array_split(msg_train, 5)
l = list()
for split in splits:
    l.append(embed(split))

In [55]:
X_train = tf.concat(l, axis=0)
del l
X_train.shape

TensorShape([75, 512])

In [64]:
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [57]:
# initialize the model and assign weights to each class
clf = SVC()
# train the model
clf.fit(X_train, y_train)
# use the model to predict the testing instances
y_pred = clf.predict(np.array(X_test))
# generate the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.62      0.62         8
           1       0.82      0.82      0.82        17

    accuracy                           0.76        25
   macro avg       0.72      0.72      0.72        25
weighted avg       0.76      0.76      0.76        25



In [60]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(np.array(X_test))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.62      0.67         8
           1       0.83      0.88      0.86        17

    accuracy                           0.80        25
   macro avg       0.77      0.75      0.76        25
weighted avg       0.80      0.80      0.80        25



In [66]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)


y_pred = clf.predict(np.array(X_test))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.75      0.71         8
           1       0.88      0.82      0.85        17

    accuracy                           0.80        25
   macro avg       0.77      0.79      0.78        25
weighted avg       0.81      0.80      0.80        25



In [74]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing


clf = MultinomialNB()

normalized = preprocessing.normalize(y_train)

clf.fit(X_train, y_train)

y_pred = clf.predict(np.array(X_test))
print(classification_report(y_test, y_pred))

ValueError: Expected 2D array, got 1D array instead:
array=[1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0.
 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1.
 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0.
 0. 1. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.