In [2]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import classification_report as class_report
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn import linear_model


In [3]:
#Takes a data frame and splits it into a training and testing set
#df: The Data frame to split
#prop: The proportion of the data frame you want for training data
#Returns, the training and testing sets
def basicSplit(df,prop):
    train = df.sample(frac = prop)
    test = df.loc[~df.index.isin(train.index)]
    return train,test

In [4]:
#Splits the data set into training and testing. Training data is subseted to have an equal amount of toxic and non toxic coments
#df: The Data Frame to split
#prop: the proportion of data to be in the training set
#Returns: A training set with equal proportions of toxic and nontoxic comments, and the testing set
def splitDF(df,prop):
    train, test = basicSplit(df,prop) #Split the data frame up
    trainToxic = train.loc[train["toxic"] == 1]#get all toxic comments from training set
    trainNotToxic = train.loc[~train.index.isin(trainToxic.index)]#get all non toxic comments from training
    train = trainToxic.append(trainNotToxic.head(len(trainToxic.index)))#create training set with all toxic training comments
                                                                        #and an equal amount of nonToxic comments
    return train,test

In [5]:
#Gets the word vector counts of the data sets- the data to be input into ogistic 
#df - the data frame to process (usually traing or testing data frame)
#vocab - Previous vocab to use (use training vocab for testing, use preselected words if words have been selected)
#lowercase - whether or not to take the lowercase of the words
#Returns: The TFVector and the list of vocab used
def process(df, vocab = None, lowercase = True):
    
    cv = CountVectorizer(stop_words = {"english"},lowercase=lowercase,vocabulary = vocab)#get count vectorizer
    wordVector = cv.fit_transform(df["comment_text"]) #get count vector of words
    tfidf_transformer = TfidfTransformer() #make tfidf_transformer
    TFVect = tfidf_transformer.fit_transform(wordVector)#perform tfidf transform
    
    if vocab == None: #get the list of vocab words if preset vocab was not used
        vocab = cv.get_feature_names()
        
    return TFVect,vocab

In [19]:
#Prints out the count of misclassified toxic and not toxic comments
#test: The test data set
#predict: The predictions made by the model
#tag: The type of comment classified (toxic, obscene, ect.)
def CountPredict (test,predict,tag):
    
    test.index = [x for x in range(0,len(test.index))] #reset the test index to make iteration easier
    
    count1 = 0 #total non-toxic count
    count2 = 0 #total toxic count
    count1f = 0 #Count of non-toxic comments classified as toxic
    count2f = 0 #Count of toxic comments classified as non-toxic
    
    #count toxic and non toxic data
    for i in range(len(predict)):
        
        if test.loc[i,tag] == 0:
                count1 +=1 
                if predict[i] != test.loc[i,tag]:
                    count1f += 1
                    
        if test.loc[i,tag] ==1:
            count2 += 1
            if predict[i] != test.loc[i,tag]:
                count2f +=1
                
    print ("Proportion Non-Toxic Misclassified: %i/%i" %(count1f, count1), "= ", (count1f/count1))
    print ("Proportion Toxic Misclassified: %i/%i" %(count2f, count2), "= ", (count2f/count2))

In [43]:
#Find a good logistic model using a reduced number of words
#df: The data frame
#tag: The category to model (toxic, obscene, ect)
#Return:
#bestFit: The best model found
#bestWords: The words used in the model
#test: The final test data set with predicted values appended
def top1400WordslogReg (df,tag):
    
    procAll, theVocab = process(df) #get the total vocab of data frame(used so that errors don't occur later)
    
    logistic = linear_model.LogisticRegression(penalty="l1") #make logistic modeler
   
    bestScore = 0 #The best score scene in the models generated
    bestFit = None #The model with the best score
    bestVocab = [] #The vocab of the best model
    
    #make 5 models and pick the one with the highest score
    for i in range(5):
        train,test = splitDF(df,.7)
        procTrain,vocab1 = process(train,theVocab)
        procTest,vocab = process(test,vocab1)
        logfit = logistic.fit(procTrain, train[tag])
        score = logfit.score(procTest,test[tag])
        

        if score > bestScore:
            bestScore = score
            bestFit = logfit
            bestVocab = [x for x in vocab1]
    
    #coefficients should map to words in the vocab, Check if they are equal other wise 
    #words selectd may not be random
    if len(bestVocab) != len(bestFit.coef_[0]):
        print("Warning: Vocab and Coefficient length not equal, something went wrong")
        print("Length Best Vocab: ", len(bestVocab))
        print("Length Logistic Coeficients: ", len(bestFit.coef_[0]))

    print("Score of word selecting model: ",bestScore)
    
    #Select words that contribuited signifcantly to the prediction (abs(coefficent) > .5)
    bestWords = []
    wordTouple = []
    for i in range(len(bestFit.coef_[0])):
        if bestFit.coef_[0][i] >=.5 or bestFit.coef_[0][i] <= -.5:
            bestWords.append(bestVocab[i])
            
        #uncomment to pick top 1000 words instead of using a thresh hold
        #wordTouple.append((abs(bestFit.coef_[0][i]),bestVocab[i]))
        
    #wordTouple.sort()
    #bestWords = [i for i in wordTouple[-1000:]]
    #print(wordTouple[-1])
    
    print("Number of words: ", len(bestWords))
    
    #Pick the best out of 10 models using only the selected words
    bestScore = 0
    for i in range(10):
        
        train,test = splitDF(df,.7)
        procTrain,vocab = process(train,bestWords)
        procTest,vocab = process(test,vocab)
        logfit = logistic.fit(procTrain, train[tag])
        score = logfit.score(procTest,test[tag])
        if score > bestScore:
            bestScore = score
            bestFit = logfit
            
  
    #run the prediction on the testing data of the last iteration of the loop for convience
    print ("Score of Final Model: ", bestScore)
    predict = bestFit.predict(procTest)
    CountPredict(test,predict,tag)
    
    #add the predected values to the test data frame and return it
    newRow = "predicted_" + tag
    test[newRow] = predict
    
    return bestFit, bestWords, test


In [41]:
dFile = "C:/Users/alexh/Downloads/train.csv"
df = pd.read_csv(dFile)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [46]:
output = top1400WordslogReg(df,"toxic")

Score of word selecting model:  0.916191431138
Number of words:  523
Score of Final Model:  0.91234776796
Proportion Non-Toxic Misclassified: 3908/43298 =  0.09025821054090258
Proportion Toxic Misclassified: 413/4573 =  0.09031270500765362


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [44]:
output[2]["predicted_toxic"]

0        1
1        1
2        1
3        0
4        1
5        0
6        0
7        1
8        1
9        1
10       1
11       0
12       0
13       1
14       1
15       0
16       0
17       0
18       1
19       1
20       1
21       0
22       1
23       0
24       0
25       0
26       1
27       0
28       1
29       0
        ..
47841    0
47842    1
47843    0
47844    1
47845    0
47846    1
47847    1
47848    1
47849    0
47850    0
47851    1
47852    1
47853    0
47854    1
47855    0
47856    0
47857    0
47858    1
47859    1
47860    0
47861    1
47862    1
47863    1
47864    1
47865    0
47866    1
47867    1
47868    1
47869    0
47870    1
Name: predicted_toxic, Length: 47871, dtype: int64