In [2]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random

%matplotlib inline

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler, label_binarize, MaxAbsScaler
from sklearn.cross_validation import train_test_split

import sklearn.metrics as skm
from sklearn.metrics import roc_curve, auc

from scipy import interp

from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [4]:
df = pd.read_csv('../data/Consumer_Complaints_with_Consumer_Complaint_Narratives.csv')

# Create 'df_text' for text modeling

In [5]:
def create_df_text(df):
    df_text = pd.DataFrame()  # Create empty df to fill
    
    df_text['Consumer complaint narrative'] = df['Consumer complaint narrative']
    
    cust_resp_dict ={'Closed':0,
                 'Untimely response':0,
                 'Closed with explanation':1,
                 'Closed with non-monetary relief':2,
                 'Closed with monetary relief':2}
    
    df_text['Company response to consumer'] = df['Company response to consumer'].apply(lambda x: cust_resp_dict[x])
    
    return df_text

In [6]:
create_df_text(df).head()

Unnamed: 0,Consumer complaint narrative,Company response to consumer
0,Received Capital One charge card offer XXXX. A...,1
1,I do n't know how they got my cell number. I t...,1
2,I 'm a longtime member of Charter One Bank/RBS...,1
3,"After looking at my credit report, I saw a col...",2
4,I received a call from a XXXX XXXX from XXXX @...,1


# Using Text Features

In [59]:
def prep_text_data(df):
    from sklearn.preprocessing import StandardScaler, label_binarize, MaxAbsScaler
#     print df.head()
    y_ = df['Company response to consumer']
    y = label_binarize(y_, classes = [0, 1, 2])
    n_classes=3 
    y = y.sum(axis=1)
    df.pop('Company response to consumer')
    X = df.values
    X = X.sum(axis=1)

    print X.shape, y.shape
    print y
    print X
    return X, y

In [60]:
df_text = create_df_text(df)
X, y = prep_text_data(df_text)

(84466,) (84466,)
[1 1 1 ..., 1 1 1]
[ 'Received Capital One charge card offer XXXX. Applied, was accepted ( {$500.00} limit ), activated card and used for XXXX presents. Charge card # XXXX. Right after activating card ... Capital One sent me another card with same {$500.00} limit ... never activated ... never used that card. First bill from above card # came due XXXX and minimum payment due was {$15.00}. I sent in {$20.00} via USPMO and sent in before due date. With the XXXX non-activated, non used credit card ... ..they also sent me bill for some yearly fees when never even activated the card. So called them up ... ... .told them did not want the card and sent back to them. Well ... .get my next bill from the card # above ( XXXX ) ... .they did not credit me for the {$20.00} payment and charged me outrageous over the limit fees, late fees, etc ... and now {$70.00} payment due. So, I called up, their rep stated they accidentally applied my {$20.00} payment to wrong account number and 

In [37]:
def train_test_split_function(X, y):
    from sklearn.cross_validation import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=11)
    
#     scaler = StandardScaler()
#     scaler.fit(X_train)
#     X_train = scaler.transform(X_train)
#     X_test = scaler.transform(X_test)
#     from sklearn.utils import shuffle
#     X_shuf, Y_shuf = shuffle(X_transformed, Y)


#     vectorizer = TfidfVectorizer(stop_words='english',lowercase=True, min_df=0.001, max_df = 0.2)
#     X_train = vectorizer.fit_transform(X_train)
#     X_test = vectorizer.transform(X_test)
    return X_train, X_test, y_train, y_test

In [38]:
X_train, X_test, y_train, y_test = train_test_split_function(X, y)

In [39]:
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape

(59126,)
(25340,)
(59126,)
(25340,)


# Kaggle Amazon Food Stuff
https://www.kaggle.com/gpayen/d/snap/amazon-fine-food-reviews/building-a-prediction-model

In [57]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

## Cleaning the data
To format our data and build the Term-doc incidence matrix, many operations will be performed on the data:

•Stemming
•Stop words removal
•Lowering
•Tokenization
•Pruning (numbers and punctuation)

In [58]:
stemmer = PorterStemmer()
from nltk.corpus import stopwords

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    #[line.decode('utf-8').strip() for line in title_file.readlines()]
    # tokens = [word for word in tokens if word not in stopwords.words('english')]
    stems = stem_tokens(tokens, stemmer)
    return ' '.join(stems)

from string import maketrans
intab = string.punctuation
outtab = "                                "
trantab = maketrans(intab, outtab)

## Training set
corpus = []
for text in X_train:
    text = text.lower()
    text = text.translate(trantab)
    text = tokenize(text)
    corpus.append(text)
    
count_vect = CountVectorizer(input="file")
X_train_counts = count_vect.fit_transform(corpus)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

## Testing set
test_set = []
for text in X_test:
    text = text.lower()
    text = text.translate(trantab)
    text.tokenize(text)
    test_set.append(text)
    
X_new_counts = count_vect.transform(test_set)
X_test_tfidf = tfidf_transformer.transform(X_new_counts)

from pandas import *
prediction = dict()


UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 0: ordinal not in range(128)

## Applying Multinomial Naïve Bayes learning method¶


In [None]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train_tfidf, y_train)
prediction['Multinomial'] = model.predict(X_test_tfidf)

## Applying Bernoulli Naïve Bayes learning method¶

In [None]:
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB().fit(X_train_tfidf, y_train)
prediction['Bernoulli'] = model.predict(X_test_tfidf)

## Applying Logistic regression learning method¶


In [None]:
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train_tfidf, y_train)
prediction['Logistic'] = logreg.predict(X_test_tfidf)

In [12]:
def formatt(x):
    if x == 'negative':
        return 0
    return 1
vfunc = np.vectorize(formatt)

cmp = 0
colors = ['b', 'g', 'y', 'm', 'k']
for model, predicted in prediction.items():
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test.map(formatt), vfunc(predicted))
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.plot(false_positive_rate, true_positive_rate, colors[cmp], label='%s: AUC %0.2f'% (model,roc_auc))
    cmp += 1

plt.title('Classifiers comparaison with ROC')
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

NameError: name 'prediction' is not defined

In [None]:
print(metrics.classification_report(y_test, prediction['Logistic'], target_names = ["positive", "negative"]))

In [13]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(set(Score)))
    plt.xticks(tick_marks, set(Score), rotation=45)
    plt.yticks(tick_marks, set(Score))
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Compute confusion matrix
cm = confusion_matrix(y_test, prediction['Logistic'])
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cm)    

cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')

plt.show()

NameError: name 'y_test' is not defined