In [1]:
import re, string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

## Load raw data

In [2]:
def clean_text(text):
    text = text.lower()
    text = re.findall(r'\b[a-z]+\b', text)
    return ' '.join(text)

In [3]:
training_data = pd.read_csv('./train.csv')
training_data.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


### Cleaning data

In [4]:
training_data['cleaned_comment'] = training_data['Comment'].apply(clean_text)

In [5]:
training_data.head()

Unnamed: 0,Insult,Date,Comment,cleaned_comment
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ...",i really don t understand your point it seems ...
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ...",a of canadians can and has been wrong before n...
3,0,,"""listen if you dont wanna get married to a man...",listen if you dont wanna get married to a man ...
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",c b xu bi t xecnh c ho kh nc ng d ng cu xed ch...


In [6]:
training_data.shape

(3947, 4)

In [7]:
training_data['Insult'].value_counts()

0    2898
1    1049
Name: Insult, dtype: int64

In [8]:
# How does it look the insults
training_data.query('Insult == 1')

Unnamed: 0,Insult,Date,Comment,cleaned_comment
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad
7,1,,"""shut the fuck up. you and the rest of your fa...",shut the fuck up you and the rest of your fagg...
8,1,20120502173553Z,"""Either you are fake or extremely stupid...may...",either you are fake or extremely stupid maybe ...
9,1,20120620160512Z,"""That you are an idiot who understands neither...",that you are an idiot who understands neither ...
15,1,20120611090207Z,"""FOR SOME REASON U SOUND RETARDED. LOL. DAMN. ...",for some reason u sound retarded lol damn wher...
...,...,...,...,...
3929,1,20120619022522Z,"""Why don't you shut your stupid mouth, pennyla...",why don t you shut your stupid mouth pennylane...
3931,1,20120618222326Z,"""You goofy Repub thugs don't have a clue, not ...",you goofy repub thugs don t have a clue not th...
3934,1,20120610083604Z,"""True, maybe he's deliberately being an assh*l...",true maybe he s deliberately being an assh le ...
3935,1,20120515160649Z,"""JoelWeltman, you look like a pedophile from y...",joelweltman you look like a pedophile from you...


## Make some features

Formulate the problem into mathematical terms. The function `CountVectorizer` transform all the words of each tweet into a vector. The lenght of this vector is equal to the total number of unique representative words in the dataset. The function finaly returns a matrix with all these vectors.

In [9]:
count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,3), stop_words='english', max_features=5000)
count_vectorizer.fit(training_data['cleaned_comment'])

CountVectorizer(max_features=5000, ngram_range=(1, 3), stop_words='english')

In [10]:
#count_vectorizer.vocabulary_

In [11]:
# Featurizing tags
X = count_vectorizer.transform(training_data['cleaned_comment'])
y = training_data['Insult']

In [12]:
CountVectorizer?

In [13]:
# Compress representation of the matrix
X

<3947x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 45934 stored elements in Compressed Sparse Row format>

## Cross-validate

Now the data is going to be divided in train data and test data.

### Split data

In [14]:
def split_data (X, y, p=0.75):
    """ 
    X = Feature matrix
    y = vector
    p = percentage of data in training data
    """
    mask = np.array([ bool(np.random.binomial(1,p)) for _ in range(X.shape[0])])
    X_train = X[mask]
    y_train = y[mask]
    X_test = X[~mask]
    y_test = y[~mask]
    
    return X_train, y_train, X_test, y_test

In [15]:
# Explanation of previus code
p = 0.2
mask = np.array([ bool(np.random.binomial(1,p)) for _ in range(X.shape[0])])
pd.Series(mask).value_counts() / X.shape[0]

False    0.811249
True     0.188751
dtype: float64

In [16]:
X_train, y_train, X_test, y_test = split_data(X,y)

### Fit a model on training data

In [17]:
from sklearn.linear_model import LogisticRegression

# Classifier ~ Classifier
clf = LogisticRegression(verbose=1)
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


LogisticRegression(verbose=1)

### Validate model on test data

In [18]:
from sklearn.metrics import accuracy_score

predictions = clf.predict(X_test)
validation_score = accuracy_score(y_test, predictions)
print('Validation score: ', validation_score)

Validation score:  0.8352570828961176


## Remeber, everything is a hyper-parameter

It is important to automatize the process of testing models and hyperparameters, in that way you can look faster which model works fine and focus in it.

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [29]:
class PredictionPipeline:
    
    def __init__(self, ngram_range, vectorizer_class, model_class, training_data):
        self.ngram_range = ngram_range
        self.vectorizer_class = vectorizer_class
        self.model_class = model_class
        self.training_data = training_data
        self.vectorizer = None
        self.X = None
        self.y = None
        self.model = None
        self.validation_score = None
        
    def run(self):
        self._fit_vectorizer()
        self._featurize_text()
        self._split_train_and_validation_sets()
        self._fit_model_on_training_data()
        self._validate_model_on_validation_set()
        
        print(
        """
        Vectorizer Class: {vectorizer_class}\n
        N-gram Range: {ngram_range}\n
        Model Class: {model_class}\n
        Validation Score: {validation_score}
            
        """.format(

            vectorizer_class=repr(self.vectorizer_class.__name__), 
            ngram_range=self.ngram_range, 
            model_class=repr(self.model_class.__name__), 
            validation_score=round(self.validation_score, 4)

                )
            )
        
        
    def _fit_vectorizer(self):
        self.vectorizer = vectorizer_class(analyzer='word', ngram_range=ngram_range,
                                           stop_words='english', max_features=50000)
        self.vectorizer.fit(self.training_data['cleaned_comment'])
       
    
    def _featurize_text(self):
        self.X = self.vectorizer.transform(self.training_data['cleaned_comment'])
        self.y = self.training_data['Insult']
        
    
    def _split_train_and_validation_sets(self):
        self.X_train, self.X_validation, self.y_train, self.y_validation = train_test_split(
        self.X, self.y, test_size=0.25, random_state=123)
       
    
    def _fit_model_on_training_data(self):
        self.model = self.model_class()
        self.model.fit(self.X_train, self.y_train)
        
    def _validate_model_on_validation_set(self):
        predictions = self.model.predict(self.X_validation)
        self.validation_score = accuracy_score(self.y_validation, predictions)
        
    

In [30]:
results = {}

for ngram_range in [(1,1), (1,2), (1,3), (1,4)]:
    for vectorizer_class in [CountVectorizer, TfidfVectorizer]:
        for model_class in [LogisticRegression, LinearSVC, RandomForestClassifier]:
            
            # Run prediction pipeline
            prediction_pipeline = PredictionPipeline(
                ngram_range = ngram_range,
                vectorizer_class = vectorizer_class,
                model_class = model_class,
                training_data =  training_data
            )
            prediction_pipeline.run()
            
            results[str(prediction_pipeline.validation_score)] = {
                'vectorizer_class': prediction_pipeline.vectorizer_class,
                'ngram_range': prediction_pipeline.ngram_range,
                'model_class': prediction_pipeline.model_class
            }
            


        Vectorizer Class: 'CountVectorizer'

        N-gram Range: (1, 1)

        Model Class: 'LogisticRegression'

        Validation Score: 0.8227
            
        

        Vectorizer Class: 'CountVectorizer'

        N-gram Range: (1, 1)

        Model Class: 'LinearSVC'

        Validation Score: 0.8166
            
        

        Vectorizer Class: 'CountVectorizer'

        N-gram Range: (1, 1)

        Model Class: 'RandomForestClassifier'

        Validation Score: 0.8318
            
        

        Vectorizer Class: 'TfidfVectorizer'

        N-gram Range: (1, 1)

        Model Class: 'LogisticRegression'

        Validation Score: 0.7923
            
        

        Vectorizer Class: 'TfidfVectorizer'

        N-gram Range: (1, 1)

        Model Class: 'LinearSVC'

        Validation Score: 0.8278
            
        

        Vectorizer Class: 'TfidfVectorizer'

        N-gram Range: (1, 1)

        Model Class: 'RandomForestClassifier'

        Validation Sc

In [33]:
top_3_scores = sorted(results.keys(), reverse=True)[:3]

for score in top_3_scores:
    print(f'Score: {score}\nParameters: {results[score]}')

Score: 0.8318135764944276
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'ngram_range': (1, 1), 'model_class': <class 'sklearn.ensemble._forest.RandomForestClassifier'>}
Score: 0.8308004052684904
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'ngram_range': (1, 4), 'model_class': <class 'sklearn.linear_model._logistic.LogisticRegression'>}
Score: 0.8277608915906788
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, 'ngram_range': (1, 1), 'model_class': <class 'sklearn.svm._classes.LinearSVC'>}


In [37]:
top_3_scores

['0.8318135764944276', '0.8308004052684904', '0.8277608915906788']

## Train final model

In [38]:
top_score_key = top_3_scores[0]

In [45]:
vectorizer_class = results[top_score_key]['vectorizer_class']
ngram_range = results[top_score_key]['ngram_range']
model_class = results[top_score_key]['model_class']

# Fit vectorizer
vectorizer = vectorizer_class(analyzer='word', ngram_range=ngram_range, stop_words='english', max_features=50000)
vectorizer.fit(training_data['cleaned_comment']) #Generates the vector with all the vocabulary

CountVectorizer(max_features=50000, stop_words='english')

In [47]:
# transform text
X = vectorizer.transform(training_data['cleaned_comment'])
y = training_data['Insult']

In [49]:
# Fit model on training data
model = model_class()
model.fit(X,y)

RandomForestClassifier()

## Run it live

In [64]:
input_string = input('Please enter a string: ')
input_string = clean_text(input_string)
X_test = vectorizer.transform([input_string])

prediction = model.predict(X_test)[0]

print('Insult: ', bool(prediction))

Please enter a string: you are a dumb
Insult:  True
