In [None]:
# data manipulation
import pandas as pd
import numpy as np
import itertools

# explore
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# nlp
import unicodedata
import re
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk.sentiment
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes as nb
from sklearn.feature_selection import SelectKBest,chi2,mutual_info_classif

# local
import wrangle as w

# Acquire

In [None]:
# from google.oauth2 import service_account
# import pandas_gbq

# credentials = service_account.Credentials.from_service_account_file(
#     'my-ds-projects-d864a770b51b.json',
# )
# df = pandas_gbq.read_gbq("select * from cfpb_complaints.complaint_database",dialect='standard',project_id="my-ds-projects",use_bqstorage_api=True,credentials=credentials)
# df

In [None]:
# df = w.check_file_exists_gbq('cfpb.csv','service_key.json')
# df.head(5)

In [None]:
# df.info()

# Prepare

In [None]:
# df_clean = w.clean_data(df.copy())
# df_clean.head()

In [None]:
# df_clean.info()

In [None]:
# pd.set_option('display.max_colwidth',None)

In [None]:
# df_clean_sample = prep_narrative(df_clean.head(10000))
# df_clean_sample[['narrative','clean','lemon']].head(1)

In [None]:
# all_words = [word for row in df_clean_sample['clean'] for word in row.split()]
# all_words = all_words.sort()

In [None]:
# type(all_words)

In [None]:
# all_words.sort()

In [None]:
# all_words

In [None]:
# sample_str = df_clean.narrative[0]
# sample_str

In [None]:
# sample_str_sub = re.sub(r'[X{1,}\d\']', ' ', string=sample_str)
# sample_str_sub

In [None]:
# sample_str_clean = basic_clean(sample_str_sub)
# sample_str_clean

In [None]:
# sample_str_token = token_it_up(sample_str_clean)
# sample_str_token

In [None]:
# sample_str_stop = remove_stopwords(sample_str_token,["&#9;", "12", "'"])
# sample_str_stop

In [None]:
# sample_str_lem = lemmad(sample_str_stop)
# sample_str_lem

In [None]:
# df_lem = w.prep_narrative(df_clean)
# df_lem.head(1)

In [None]:
# df_lem.to_parquet('df_lem.parquet')

In [None]:
# df_lem = pd.read_parquet('df_lem.parquet')
# df_lem.head(1)

In [None]:
# df_lem = pd.read_parquet('df_prep.parquet')
# df_lem.head(1)

In [None]:
# df_lem.company_response_to_consumer.value_counts(normalize=True).plot(
#     kind='barh',
#     title='Percent of Company Responses to Customer',
#     xlabel='Percent of Responses',
#     ylabel='Company Response');

In [None]:
# df_lem.company_response_to_consumer.value_counts(normalize=True)

In [None]:
# all_words = [word for row in df_lem['lemon'] for word in row.split()]
# all_words.sort()
# all_words

# Explore

In [None]:
# Load the prepared dataset from the parquet file in the notebook's working directory.
df_lem = pd.read_parquet('df_prep.parquet')
# Show a single row as a quick check of the columns that were loaded.
df_lem.head(1)

In [None]:
# Split into train/validate/test with the local wrangle helper; the second argument names the
# target column (presumably used for stratification — confirm in wrangle.split_data).
train, validate, test = w.split_data(df_lem,'company_response_to_consumer')

In [None]:
# train.head(1)

In [None]:
# train.company_response_to_consumer.value_counts(normalize=True)

In [None]:
# validate.company_response_to_consumer.value_counts(normalize=True)

In [None]:
# test.company_response_to_consumer.value_counts(normalize=True)

## 8. Which product is most likely to receive monetary relief?

In [None]:
# Keep only the training complaints whose company response was 'Closed with monetary relief'.
money = train[train.company_response_to_consumer=='Closed with monetary relief']
money.head(1)

In [None]:
# Number of monetary-relief complaints in each product bin.
money.product_bins.value_counts()

In [None]:
# Same counts expressed as a share of all monetary-relief complaints.
money.product_bins.value_counts(normalize=True)

In [None]:
# Product bin x company response, normalized by row: each product's responses sum to 1.
cross = pd.crosstab(train['product_bins'],train['company_response_to_consumer'],normalize='index')
cross

In [None]:
# Per-product proportion of complaints closed with monetary relief, smallest first.
cross['Closed with monetary relief'].sort_values()

In [None]:
# Horizontal bar chart of the proportions above (the trailing ';' hides the Axes repr).
# The same plot is wrapped as monetary_product() further down in this notebook.
cross['Closed with monetary relief'].sort_values().plot(kind='barh', title='Proportions of Monetary Relief', xlabel='Proportion of Complaints for the Product', ylabel='Product Type');

In [None]:
# credit card and bank related products have the highest chance of getting monetary relief at just under 20%
# it makes sense that credit report products have the least chance of getting monetary relief

In [None]:
def monetary_product(train):
    """
    Plot, for every product bin, the proportion of its complaints that were
    closed with monetary relief.

    :param train: DataFrame with `product_bins` and `company_response_to_consumer` columns
    :return: None; the horizontal bar chart is drawn as a side effect
    """
    # row-normalized crosstab, so each product's response proportions sum to 1
    response_share = pd.crosstab(
        train['product_bins'],
        train['company_response_to_consumer'],
        normalize='index',
    )
    # order the monetary relief proportions from smallest to largest before plotting
    relief_share = response_share['Closed with monetary relief'].sort_values()
    relief_share.plot(
        kind='barh',
        title='Proportions of Monetary Relief',
        xlabel='Proportion of Complaints for the Product',
        ylabel='Product Type',
    )

In [None]:
# Draw the monetary relief proportions for the training split.
monetary_product(train)

## 9. Are there more complaints during certain seasons of the year?

In [None]:
# from datetime import datetime

In [None]:
# train.head(1)

In [None]:
# train['month'] = train.date_received.apply(lambda row: row.strftime("%m")).astype(str)
# train['year'] = train.date_received.apply(lambda row: row.strftime("%y")).astype(str)

In [None]:
# # Performed 1 aggregation grouped on columns: 'month', 'year'
# yearly = train.groupby(['month', 'year']).agg(year_count=('year', 'count'))

# # Performed 1 aggregation grouped on column: 'month'
# monthly = yearly.groupby(['month']).agg(year_count_mean=('year_count', 'mean'))

In [None]:
# train.groupby(['month']).agg(month_count=('month', 'count')).plot(kind='bar')

# Modeling

In [None]:
import model as m

In [None]:
def encode(df):
    '''
    Append dummy columns for the `tags` and `product_bins` columns.

    Dummy columns are named after the category value alone (empty prefix and
    separator) and the first category of each column is dropped. The original
    columns are kept; the dummies are added to the right of them.

    :param df: DataFrame containing `tags` and `product_bins`
    :return: new DataFrame = df plus the dummy columns
    '''
    # the categorical columns to expand
    categorical = df[['tags', 'product_bins']]
    # one dummy column per category value, first category of each column dropped
    dummies = pd.get_dummies(categorical, prefix='', prefix_sep='', drop_first=True)
    # original columns first, dummies alongside
    combined = pd.concat([df, dummies], axis=1)
    return combined

In [None]:
# Columns removed from every feature matrix: the target plus columns not used as model inputs
# ('tags' and 'product_bins' are replaced by the dummy columns that encode() appends).
drop_cols = ['date_received','company_response_to_consumer','clean','state','company_name','tags','product_bins']
target_col = 'company_response_to_consumer'

X_train = encode(train).drop(columns=drop_cols)
y_train = train[target_col]

# encode() builds dummies separately for each split, so a category missing from a split would
# leave that split with different columns than train. Align validate/test to the training
# columns: dummies absent from the split are added as 0, columns unseen in train are dropped.
X_val = encode(validate).drop(columns=drop_cols).reindex(columns=X_train.columns, fill_value=0)
y_val = validate[target_col]
X_test = encode(test).drop(columns=drop_cols).reindex(columns=X_train.columns, fill_value=0)
y_test = test[target_col]
X_train.head()

In [None]:
def make_cv(Xtr,Xv,Xt,ngram_range=(1,1),max_features=2900):
    """
    Convert the `lemon` text column of the train, validation and test frames into
    bag-of-words count features with a CountVectorizer.

    The vectorizer is fit on the training text only and then applied to validation
    and test, so all three outputs share the training vocabulary. The token pattern
    keeps single-character tokens.

    :param Xtr: training DataFrame with a `lemon` text column
    :param Xv: validation DataFrame with a `lemon` text column
    :param Xt: test DataFrame with a `lemon` text column
    :param ngram_range: (min_n, max_n) passed to CountVectorizer. The default (1, 1)
    uses single words only, which is what this function has always done; pass (1, 3)
    to include bigrams and trigrams
    :param max_features: vocabulary size cap passed to CountVectorizer (default 2900)
    :return: three dataframes of counts: Xtr_cv, Xv_cv, and Xt_cv, one column per
    vocabulary term, each carrying the index of its input frame.
    """
    # bag of words; the token pattern keeps single characters
    cv = CountVectorizer(token_pattern=r'(?u)\b\w+\b',
                         ngram_range=ngram_range,
                         max_features=max_features)
    # fit and transform train
    Xtr_bow_cv = cv.fit_transform(Xtr.lemon)
    # transform val and test with the vocabulary learned from train
    Xv_bow_cv = cv.transform(Xv.lemon)
    Xt_bow_cv = cv.transform(Xt.lemon)
    # the vocabulary is identical for all three frames, so look it up once
    vocab = cv.get_feature_names_out()
    # make dfs; toarray() gives a plain ndarray (todense() returns the legacy np.matrix type)
    Xtr_cv = pd.DataFrame(Xtr_bow_cv.toarray(),columns=vocab,index=Xtr.index)
    Xv_cv = pd.DataFrame(Xv_bow_cv.toarray(),columns=vocab,index=Xv.index)
    Xt_cv = pd.DataFrame(Xt_bow_cv.toarray(),columns=vocab,index=Xt.index)
    return Xtr_cv,Xv_cv,Xt_cv


def make_tfidf(Xtr,Xv,Xt,ngram_range=(1,1),max_features=2900):
    """
    Convert the `lemon` text column of the train, validation and test frames into
    TF-IDF features with a TfidfVectorizer.

    The vectorizer is fit on the training text only and then applied to validation
    and test, so all three outputs share the training vocabulary. The token pattern
    keeps single-character tokens.

    :param Xtr: training DataFrame with a `lemon` text column
    :param Xv: validation DataFrame with a `lemon` text column
    :param Xt: test DataFrame with a `lemon` text column
    :param ngram_range: (min_n, max_n) passed to TfidfVectorizer. The default (1, 1)
    uses single words only, which is what this function has always done; pass (1, 3)
    to include bigrams and trigrams
    :param max_features: vocabulary size cap passed to TfidfVectorizer (default 2900)
    :return: three dataframes of TF-IDF weights: Xtr_tfidf, Xv_tfidf, and Xt_tfidf, one
    column per vocabulary term, each carrying the index of its input frame.
    """
    # tf-idf bag of words; the token pattern keeps single characters
    tfidf = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b',
                            ngram_range=ngram_range,
                            max_features=max_features)
    # fit and transform train
    Xtr_bow_tfidf = tfidf.fit_transform(Xtr.lemon)
    # transform val and test with the vocabulary learned from train
    Xv_bow_tfidf = tfidf.transform(Xv.lemon)
    Xt_bow_tfidf = tfidf.transform(Xt.lemon)
    # the vocabulary is identical for all three frames, so look it up once
    vocab = tfidf.get_feature_names_out()
    # make dfs; toarray() gives a plain ndarray (todense() returns the legacy np.matrix type)
    Xtr_tfidf = pd.DataFrame(Xtr_bow_tfidf.toarray(),columns=vocab,index=Xtr.index)
    Xv_tfidf = pd.DataFrame(Xv_bow_tfidf.toarray(),columns=vocab,index=Xv.index)
    Xt_tfidf = pd.DataFrame(Xt_bow_tfidf.toarray(),columns=vocab,index=Xt.index)
    return Xtr_tfidf,Xv_tfidf,Xt_tfidf

In [None]:
# Count-vectorize the text; only the `lemon` column of each split is passed in.
X_train_cv,X_val_cv,X_test_cv = make_cv(X_train[['lemon']], X_val[['lemon']], X_test[['lemon']])

In [None]:
# Preview the count features for the training split.
X_train_cv.head()

In [None]:
# TF-IDF features; the full frames are passed, and make_tfidf reads only their `lemon` column.
X_train_tf,X_val_tf,X_test_tf = make_tfidf(X_train, X_val, X_test)

In [None]:
# Preview the TF-IDF features for the training split.
X_train_tf.head()

In [None]:
# Put the encoded features next to the bag-of-words counts, column-wise. pd.concat aligns on
# the index via `axis=1`; left_index/right_index are pd.merge keywords and make pd.concat
# raise TypeError. iloc[:,1:] skips the first column (presumably the raw `lemon` text — confirm).
X_train_cve = pd.concat([X_train.iloc[:,1:],X_train_cv],axis=1)
X_train_cve.head()

In [None]:
# Build the remaining combined feature sets column-wise (axis=1):
#   *_cve = encoded features + count-vectorizer features
#   *_tfe = encoded features + TF-IDF features
# iloc[:,1:] skips the first column of each X frame (presumably the raw `lemon` text — confirm).
X_val_cve = pd.concat([X_val.iloc[:,1:],X_val_cv],axis=1)
X_test_cve = pd.concat([X_test.iloc[:,1:],X_test_cv],axis=1)
X_train_tfe = pd.concat([X_train.iloc[:,1:],X_train_tf],axis=1)
X_val_tfe = pd.concat([X_val.iloc[:,1:],X_val_tf],axis=1)
X_test_tfe = pd.concat([X_test.iloc[:,1:],X_test_tf],axis=1)

In [None]:
def select_kbest(X, y, k=2, scoring=chi2):
    '''
    Score every feature with SelectKBest and return the k best-scoring ones.

    will take in two pandas objects:
    X: a dataframe representing numerical independent features
    y: a pandas Series representing a target variable
    k: a keyword argument defaulted to 2 for the number of ideal features we elect to select;
       'all' (also accepted by SelectKBest) returns every feature
    scoring: scoring type, default chi2, other category mutual_info_classif
    ---
    return: a df indexed by feature name with columns p_value and feature_score,
            sorted by feature_score descending and limited to the top k rows
            (p_value comes from the scorer; presumably empty for scorers that
            only return scores, such as mutual_info_classif — confirm)
    ---
    Format: kbest_results = function()
    '''
    kbest = SelectKBest(scoring, k=k)
    kbest.fit(X, y)
    # one row per input feature, labelled with the column names of X
    kbest_results = pd.DataFrame(
                dict(p_value=kbest.pvalues_, feature_score=kbest.scores_),
                index = X.columns)
    ranked = kbest_results.sort_values(by=['feature_score'], ascending=False)
    # head() needs an integer, so 'all' is handled explicitly
    if isinstance(k, str) and k == 'all':
        return ranked
    return ranked.head(k)

In [None]:
def tree_models(Xtr,ytr,Xv,yv):
    """
    Fit a grid of decision trees and tabulate their train and validation accuracy.

    One DecisionTreeClassifier (random_state=123) is fit for every combination of
    max_depth 1-20, min_samples_leaf 1-20 and class_weight in ('balanced', None),
    i.e. 20 * 20 * 2 = 800 fits.

    :param Xtr: training features
    :param ytr: training target
    :param Xv: validation features
    :param yv: validation target
    :return: DataFrame with one row per fit and columns model, params, tr_acc, v_acc
    """
    metrics = []
    # cycle through depth,leaf,class_weight for dec tree
    # itertools.product yields the full cartesian grid; itertools.combinations takes a single
    # iterable plus an integer r and raises TypeError when handed several iterables
    for d,l,cw in itertools.product(range(1,21),range(1,21),['balanced',None]):
        # decision tree
        tree = DecisionTreeClassifier(max_depth=d, min_samples_leaf=l,class_weight=cw,random_state=123)
        tree.fit(Xtr,ytr)
        # accuracies
        ytr_acc = tree.score(Xtr,ytr)
        yv_acc = tree.score(Xv,yv)
        # table-ize
        output ={
                'model':'Decision Tree',
                'params':f"max_depth={d},min_samples_leaf={l},class_weight={cw},random_state=123",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def forest_models(Xtr,ytr,Xv,yv):
    """
    Fit a grid of random forests and tabulate their train and validation accuracy.

    One RandomForestClassifier (random_state=123) is fit for every combination of
    max_depth 1-20, min_samples_leaf 1-20 and class_weight in
    ('balanced', 'balanced_subsample', None), i.e. 20 * 20 * 3 = 1200 fits.

    :param Xtr: training features
    :param ytr: training target
    :param Xv: validation features
    :param yv: validation target
    :return: DataFrame with one row per fit and columns model, params, tr_acc, v_acc
    """
    metrics = []
    # cycle through depth,leaf,class_weight for random forest
    # itertools.product yields the full cartesian grid; itertools.combinations takes a single
    # iterable plus an integer r and raises TypeError when handed several iterables
    for d,l,cw in itertools.product(range(1,21),range(1,21),['balanced','balanced_subsample',None]):
        # random forest
        forest = RandomForestClassifier(max_depth=d, min_samples_leaf=l,class_weight=cw,random_state=123)
        forest.fit(Xtr,ytr)
        # accuracies
        ytr_acc = forest.score(Xtr,ytr)
        yv_acc = forest.score(Xv,yv)
        # table-ize
        output ={
                'model':'Random Forest',
                'params':f"max_depth={d},min_samples_leaf={l},class_weight={cw},random_state=123",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def knn_models(Xtr,ytr,Xv,yv):
    """
    Fit a grid of k-nearest-neighbors classifiers and tabulate their train and
    validation accuracy.

    One KNeighborsClassifier is fit for every combination of n_neighbors 1-20 and
    weights in ('uniform', 'distance'), i.e. 20 * 2 = 40 fits.

    :param Xtr: training features
    :param ytr: training target
    :param Xv: validation features
    :param yv: validation target
    :return: DataFrame with one row per fit and columns model, params, tr_acc, v_acc
    """
    metrics = []
    # cycle through neighbors and weights for knn
    # itertools.product yields every (n_neighbors, weights) pair; itertools.combinations expects
    # an integer as its second argument and raises TypeError when given a list.
    # the loop variable is named `wt` so it does not shadow the `w` alias of the wrangle module
    for n,wt in itertools.product(range(1,21),['uniform', 'distance']):
        # knn
        knn = KNeighborsClassifier(n_neighbors=n,weights=wt)
        knn.fit(Xtr,ytr)
        # accuracies
        ytr_acc = knn.score(Xtr,ytr)
        yv_acc = knn.score(Xv,yv)
        # table-ize
        output ={
                'model':'KNN',
                'params':f"n_neighbors={n},weights={wt}",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def log_models(Xtr,ytr,Xv,yv):
    """
    Fit a grid of logistic regressions and tabulate their train and validation accuracy.

    One LogisticRegression (random_state=123, max_iter=500) is fit for every
    combination of C in (.01, .1, 1, 10, 100, 1000) and class_weight in
    ('balanced', None), i.e. 6 * 2 = 12 fits.

    :param Xtr: training features
    :param ytr: training target
    :param Xv: validation features
    :param yv: validation target
    :return: DataFrame with one row per fit and columns model, params, tr_acc, v_acc
    """
    metrics = []
    # cycle through C,class_weight for log reg
    # itertools.product yields every (C, class_weight) pair; itertools.combinations expects
    # an integer as its second argument and raises TypeError when given a list
    for c,cw in itertools.product([.01,.1,1,10,100,1000],['balanced',None]):
        # logistic regression
        lr = LogisticRegression(C=c,class_weight=cw,random_state=123,max_iter=500)
        lr.fit(Xtr,ytr)
        # accuracies
        ytr_acc = lr.score(Xtr,ytr)
        yv_acc = lr.score(Xv,yv)
        # table-ize
        output ={
                'model':'LogReg',
                'params':f"C={c},class_weight={cw},random_state=123,max_iter=500",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def comp_nb_models(Xtr,ytr,Xv,yv):
    """
    Fit Complement Naive Bayes models over several alpha values and tabulate their
    train and validation accuracy.

    The first model uses alpha=0 with force_alpha=True; the remaining models use the
    alphas from np.arange(.1,.6,.1) with the estimator's default force_alpha.

    :param Xtr: training features
    :param ytr: training target
    :param Xv: validation features
    :param yv: validation target
    :return: DataFrame with one row per fit and columns model, params, tr_acc, v_acc
    """
    # (estimator keyword arguments, label for the params column) for every fit.
    # The alpha=0 label is written out literally: the loop variable `a` does not exist
    # yet at that point, so interpolating it raised UnboundLocalError.
    settings = [(dict(alpha=0,force_alpha=True), 'alpha=0,force_alpha=True')]
    # cycle through alpha for CNB
    settings += [(dict(alpha=a), f'alpha={a}') for a in np.arange(.1,.6,.1)]
    metrics = []
    for kwargs,label in settings:
        # naive bayes complement
        cnb = nb.ComplementNB(**kwargs)
        cnb.fit(Xtr,ytr)
        # accuracies
        ytr_acc = cnb.score(Xtr,ytr)
        yv_acc = cnb.score(Xv,yv)
        # table-ize
        output ={
                'model':'CNB',
                'params':label,
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def multi_nb_models(Xtr,ytr,Xv,yv):
    """
    Fit Multinomial Naive Bayes models over several alpha values and tabulate their
    train and validation accuracy.

    The first model uses alpha=0 with force_alpha=True; the remaining models use the
    alphas from np.arange(.1,.6,.1) with the estimator's default force_alpha.

    :param Xtr: training features
    :param ytr: training target
    :param Xv: validation features
    :param yv: validation target
    :return: DataFrame with one row per fit and columns model, params, tr_acc, v_acc
    """
    # (estimator keyword arguments, label for the params column) for every fit.
    # The alpha=0 label is written out literally: the loop variable `a` does not exist
    # yet at that point, so interpolating it raised UnboundLocalError.
    settings = [(dict(alpha=0,force_alpha=True), 'alpha=0,force_alpha=True')]
    # cycle through alpha for MNB
    settings += [(dict(alpha=a), f'alpha={a}') for a in np.arange(.1,.6,.1)]
    metrics = []
    for kwargs,label in settings:
        # naive bayes multinomial
        mnb = nb.MultinomialNB(**kwargs)
        mnb.fit(Xtr,ytr)
        # accuracies
        ytr_acc = mnb.score(Xtr,ytr)
        yv_acc = mnb.score(Xv,yv)
        # table-ize
        output ={
                'model':'MNB',
                'params':label,
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def cat_nb_models(Xtr,ytr,Xv,yv):
    """
    Fit Categorical Naive Bayes models over several alpha values and tabulate their
    train and validation accuracy.

    The first model uses alpha=0 with force_alpha=True; the remaining models use the
    alphas from np.arange(.1,.6,.1) with the estimator's default force_alpha.

    :param Xtr: training features
    :param ytr: training target
    :param Xv: validation features
    :param yv: validation target
    :return: DataFrame with one row per fit and columns model, params, tr_acc, v_acc
    """
    # (estimator keyword arguments, label for the params column) for every fit.
    # The alpha=0 label is written out literally: the loop variable `a` does not exist
    # yet at that point, so interpolating it raised UnboundLocalError.
    settings = [(dict(alpha=0,force_alpha=True), 'alpha=0,force_alpha=True')]
    # cycle through alpha for CatNB
    settings += [(dict(alpha=a), f'alpha={a}') for a in np.arange(.1,.6,.1)]
    metrics = []
    for kwargs,label in settings:
        # naive bayes categorical
        cat = nb.CategoricalNB(**kwargs)
        cat.fit(Xtr,ytr)
        # accuracies
        ytr_acc = cat.score(Xtr,ytr)
        yv_acc = cat.score(Xv,yv)
        # table-ize
        output ={
                'model':'CatNB',
                'params':label,
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)