# Classifying Tweets for Sentiment Analysis
Reference: https://medium.com/vickdata/detecting-hate-speech-in-tweets-natural-language-processing-in-python-for-beginners-4e591952223

In [0]:
import pandas as pd

In [2]:
train = pd.read_csv('https://datahack-prod.s3.amazonaws.com/train_file/train_E6oV3lV.csv')
print("Training Set:"% train.columns, train.shape, len(train))
test = pd.read_csv('https://datahack-prod.s3.amazonaws.com/test_file/test_tweets_anuFYb8.csv')
print("Test Set:"% test.columns, test.shape, len(test))

Training Set: (31962, 3) 31962
Test Set: (17197, 2) 17197


## Text Cleaning

In [0]:
import re
def  clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

## Handling imbalanced classes

In [4]:
# Upsampling
from sklearn.utils import resample
train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]
train_minority_upsampled = resample(train_minority, 
                                 replace=True,    
                                 n_samples=len(train_majority),   
                                 random_state=123)
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

In [0]:
# Downsampling
# train_majority = train_clean[train_clean.label==0]
# train_minority = train_clean[train_clean.label==1]
 
# train_majority_downsampled = resample(train_majority, 
#                                  replace=True,  
#                                  n_samples=len(train_minority),   
#                                  random_state=123)
# train_downsampled = pd.concat([train_majority_downsampled, train_minority])
# train_downsampled['label'].value_counts()

I tried to use both methods and I can get a better result from downsampling so I go for it.

## CountVectoriser
A BoW model splits the words in a piece of text into tokens disregarding grammar and word order. The model also counts the frequency in which a word occurs in the text, and assigns a weight proportional to this frequency. The output is a matrix of term frequencies where each row represents the text and each column a word in the vocabulary.

## TfidfTransformer
CountVectoriser accomplishes the first two, splitting the words into tokens and counting the frequency. We can use another scikit-learn function called TfidfTransformer to apply the frequency weighting.

## Let's training the model

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),
])

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],train_upsampled['label'],random_state = 0)

In [8]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.9694666666666667