In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import classification_report as class_report
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils import resample

from sklearn import linear_model



In [2]:
"""
Takes a data frame and splits it into a training and testing set based on given proportion.

Parameters:
    df: The data frame to split.
    prop: The proportion of data to be used in the training set.
    
Return:
    train: Training data set
    test: Testing data set
"""
def basicSplit(df,prop, random_state):
    train = df.sample(frac = prop, random_state = random_state)
    test = df.loc[~df.index.isin(train.index)]
    return train,test



In [3]:
"""
Splits the data set into training and testing. Toxic comments are upsampled to give more weight
to toxic comment classification.

Parameters:
    df: The data frame to split.
    prop: The proportion of data to be used in the training set.
    random_state: Fixes randomization of sampling
    
Return:
    train: Training data set with equal proportions toxic and non toxic data
    test: Testing data set
"""

def splitDF_up_sample_toxic (df,prop, random_state = 0):
    
    train, test = basicSplit(df,prop, random_state = 0) #Split the data frame up
    
    trainToxic = train.loc[train["toxic"] == 1]#get all toxic comments from training set
    #print(trainToxic)
    trainNotToxic = train.loc[~train.index.isin(trainToxic.index)]#get all non toxic comments from training
    train = trainNotToxic.append(resample(trainToxic,n_samples= len(trainNotToxic.index),random_state = random_state))
    return train,test


In [5]:
"""
Gets the word vector counts of the data sets for the logistic model
Parameters
    df: The data frame to process (usually traing or testing data frame)
    vocab: Previous vocab to use (use training vocab for testing, use preselected words if words have been selected)
    lowercase: Whether or not to take the lowercase of the words
    
Returns: 
    TFVect: Vector of word counts for each comment
    vocab: List of words used
"""
def process(df, vocab = None, lowercase = True):
    
    cv = CountVectorizer(stop_words = {"english"},lowercase=lowercase,vocabulary = vocab)#get count vectorizer
    wordVector = cv.fit_transform(df["comment_text"]) #get count vector of words
    tfidf_transformer = TfidfTransformer() #make tfidf_transformer
    TFVect = tfidf_transformer.fit_transform(wordVector)#perform tfidf transform
    
    if vocab == None: #get the list of vocab words if preset vocab was not used
        vocab = cv.get_feature_names()
        
    return TFVect,vocab


In [6]:
"""
Prints out the count of misclassified toxic and not toxic comments.

Parameters:
    test: The test data set
    predict: The predictions made by the model
    tag: The type of comment classified (toxic, obscene, ect.)
"""

def CountPredict (test,predict,tag):
    
    test.index = [x for x in range(0,len(test.index))] #reset the test index to make iteration easier
    
    count1 = 0 #total non-toxic count
    count2 = 0 #total toxic count
    count1f = 0 #Count of non-toxic comments classified as toxic
    count2f = 0 #Count of toxic comments classified as non-toxic
    
    #count toxic and non toxic data
    for i in range(len(predict)):
        
        if test.loc[i,tag] == 0:
                count1 +=1 
                if predict[i] != test.loc[i,tag]:
                    count1f += 1
                    
        if test.loc[i,tag] ==1:
            count2 += 1
            if predict[i] != test.loc[i,tag]:
                count2f +=1
                
    print ("Proportion Non-Toxic Misclassified: %i/%i" %(count1f, count1), "= ", (count1f/count1))
    print ("Proportion Toxic Misclassified: %i/%i" %(count2f, count2), "= ", (count2f/count2))



In [7]:
"""
Find a good logistic model using a reduced number of words

Parameters:
    df: The data frame to build the model with.
    tag: The category to model (toxic, obscene, ect).
    
Return:
    bestFit: The best model found
    bestWords: The words used in the model
    test: The final test data set with predicted values appended
"""

def topWordslogReg_up_sample (df,tag):
    
    procAll, theVocab = process(df) #get the total vocab of data frame(used so that errors don't occur later)
    
    logistic = linear_model.LogisticRegression(penalty="l1") #make logistic modeler
   
    bestScore = 0 #The best score scene in the models generated
    bestFit = None #The model with the best score
    bestVocab = [] #The vocab of the best model
    
    #make 5 models and pick the one with the highest score
    for i in range(5):
        train,test= splitDF_up_sample_toxic(df,.7, random_state = i)
        procTrain,vocab1 = process(train,theVocab)
        procTest,vocab = process(test,vocab1)
        logfit = logistic.fit(procTrain, train[tag])
        score = logfit.score(procTest,test[tag])
        

        if score > bestScore:
            bestScore = score
            bestFit = logfit
            bestVocab = [x for x in vocab1]
    
    #coefficients should map to words in the vocab, Check if they are equal other wise 
    #words selectd may not be random
    if len(bestVocab) != len(bestFit.coef_[0]):
        print("Warning: Vocab and Coefficient length not equal, something went wrong")
        print("Length Best Vocab: ", len(bestVocab))
        print("Length Logistic Coeficients: ", len(bestFit.coef_[0]))

    print("Score of word selecting model: ",bestScore)
    
    #Select words that contribuited signifcantly to the prediction (abs(coefficent) > .5)
    bestWords = []
    wordTouple = []
    for i in range(len(bestFit.coef_[0])):
        if bestFit.coef_[0][i] >=.5 or bestFit.coef_[0][i] <= -.5:
            bestWords.append(bestVocab[i])
            
    
    print("Number of words: ", len(bestWords))
    theVocab = bestWords
    
    #Pick the best out of 10 models using only the selected words
    bestScore = 0
    for i in range(10):
        
        train,test = splitDF_up_sample_toxic(df,.7, random_state = i+5)
        procTrain,vocab1 = process(train,theVocab)
        procTest,vocab = process(test,vocab1)
        logfit = logistic.fit(procTrain, train[tag])
        score = logfit.score(procTest,test[tag])
        if score > bestScore or i == 0: #first
            bestScore = score
            bestFit = logfit
            
  
    #run the prediction on the testing data of the last iteration of the loop for convience
    print ("Score of Final Model: ", bestScore)
    predict = bestFit.predict(procTest)
    CountPredict(test,predict,tag)
    
    #add the predected values to the test data frame and return it
    newRow = "predicted_" + tag
    test[newRow] = predict
    
    return bestFit, bestWords, test


# In[41]:

In [9]:
DF_file = "C:\\Users\\alexh\\Downloads\\toxic_comments_data.csv" #May need to change path to find fild
DF = pd.read_csv(DF_file)
trainDF, testDF = basicSplit(DF,.7, random_state = 0)
trainDF.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
74251,c6a29bad26183dcf,"""\nI haven't paraphrased you at all, Gary. Yo...",0,0,0,0,0,0
131406,befd36e7acca9e56,I BLOCKED REVERS! I BLOCKED REVERS! I BLOCKED ...,1,0,0,0,0,0
120969,8734c26db56d1763,I'm sorry. I'd like to unreservedly retract my...,0,0,0,0,1,0
121827,8bcf03120412d869,I don't know if this is exactly like the Press...,0,0,0,0,0,0
4771,0ca7b705720d6956,"Thank you all, we'll all improve the Wikipedia...",0,0,0,0,0,0


In [10]:
testDF.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
10,0005300084f90edc,"""\nFair use rationale for Image:Wonju.jpg\n\nT...",0,0,0,0,0,0
13,0006f16e4e9f292e,Before you start throwing accusations and warn...,0,0,0,0,0,0
21,000bfd0867774845,"""\nGood to know. About me, yeah, I'm studying ...",0,0,0,0,0,0
27,000ffab30195c5e1,"Yes, because the mother of the child in the ca...",0,0,0,0,0,0


In [11]:
#Create model from training data
train_fit, bestWords, train_test_result = topWordslogReg_up_sample(trainDF,"toxic")


Score of word selecting model:  0.9389734407639511
Number of words:  2727
Score of Final Model:  0.9372127723067741
Proportion Non-Toxic Misclassified: 1767/30334 =  0.05825146700072526
Proportion Toxic Misclassified: 387/3176 =  0.12185138539042821


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
#Test model on test data
procTest,vocab = process(testDF,bestWords)
finalPredict = train_fit.predict(procTest)
CountPredict(testDF,finalPredict,"toxic")


Proportion Non-Toxic Misclassified: 2510/43290 =  0.05798105798105798
Proportion Toxic Misclassified: 580/4581 =  0.12660991049989084
