In [None]:
import wrangle as w
import pandas as pd
import numpy as np
# pyplot (not the top-level matplotlib package) is the module that provides plt.figure(), plt.show(), etc.
import matplotlib.pyplot as plt


## Acquire 

In [None]:
# Load the complaints data through the project's wrangle helper.
# NOTE(review): presumably reads the local 'cfpb.csv' cache when it exists and otherwise queries
# BigQuery using the 'service_key.json' credentials — confirm in wrangle.py, and keep the key
# file out of version control.
df=w.check_file_exists_gbq('cfpb.csv','service_key.json')

In [None]:
df.head()

## Pre-cleaning

In [None]:
df.tags.value_counts(dropna=False)

In [None]:
# Complaints with no tag are labelled 'Normal' so the column has no missing values.
df['tags'] = df['tags'].fillna('Normal')

In [None]:
df.tags.value_counts(dropna=False)

In [None]:
df.submitted_via.value_counts(dropna=False)

In [None]:
df.consumer_disputed.value_counts(dropna=False)

In [None]:
df.sample(5)

In [None]:
df.consumer_complaint_narrative.count()

## Clean Data:

In [None]:
# Run the project's cleaning step and rebind df to its result.
# NOTE(review): rebinding df makes this cell non-idempotent — re-running it feeds already-cleaned
# data back into clean_data; confirm the helper tolerates that.
df=w.clean_data(df)
df

In [None]:
# Number of complaints per company.
df['company_name'].value_counts()

In [None]:
# Keep only the complaints filed against Bank of America.
boa_df = df.loc[df['company_name'] == 'BANK OF AMERICA, NATIONAL ASSOCIATION']


In [None]:
boa_df.date_received.value_counts()

## NLTK Language Cleaning


In [None]:
df=w.prep_narrative(df)
df.head()

## Split Data

In [None]:
train,validate,test=w.split_data(df,'company_response_to_consumer')

In [None]:
# Persist the three splits as parquet files in the working directory ...
train.to_parquet('train.parquet')
validate.to_parquet('validate.parquet')
test.to_parquet('test.parquet')
# ... and read them straight back, so the in-memory frames are the ones loaded from disk.
train = pd.read_parquet('train.parquet')
validate = pd.read_parquet('validate.parquet')
test = pd.read_parquet('test.parquet')

# Exploration:

In [1]:
import wrangle as w
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
import nltk
alpha=0.05

imports loaded successfully, awaiting commands...


In [2]:
train = pd.read_parquet('train.parquet')
validate = pd.read_parquet('validate.parquet')
test = pd.read_parquet('test.parquet')

In [3]:
sentiment_df=w.sentiment_analysis(train)

KeyboardInterrupt: 

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
sentiment_df.head()

## 3. Do narratives with a neutral or positive sentiment analysis relating to bank account products lead to a response of closed with monetary relief?

In [None]:
sentiment_df.company_response_to_consumer.value_counts()

In [None]:
sentiment_df.product_bins.value_counts()

In [None]:
cwe_df=sentiment_df[sentiment_df.company_response_to_consumer=='Closed with explanation']

In [None]:

sns.boxplot(data=sentiment_df, x='product_bins', y='sentiment', hue='company_response_to_consumer')
plt.show()

In [None]:
sns.relplot(data=sentiment_df, x='product_bins', y='sentiment', hue='company_response_to_consumer')
plt.show()

In [None]:

# Overlay one sentiment density curve per company response.
# Every curve carries its own label and plt.legend() is called without arguments, so the
# legend names the curves that were actually drawn. Passing a hand-written list to
# plt.legend() assigns names by position and mislabels curves if any one is not drawn.
response_labels = {
    'Closed with monetary relief': 'money',
    'Closed with non-monetary relief': 'no money',
    'Closed with explanation': 'explanation',
    'Untimely response': 'late response',
    'Closed': 'no explanation',
}
for response, label in response_labels.items():
    scores = sentiment_df.loc[sentiment_df.company_response_to_consumer == response, 'sentiment']
    sns.kdeplot(scores, label=label)
plt.legend()
plt.show()

In [None]:
sns.color_palette()

In [None]:

# Set the figure size
plt.figure(figsize=(10, 6))

# Customize the plot style
sns.set(style="whitegrid")

# Create the bar plot
sns.barplot(data=sentiment_df, x='product_bins', y='sentiment', hue='company_response_to_consumer', ci=None, color='purple')

# Set the labels and title
plt.xlabel('Product Bins')
plt.ylabel('Sentiment')
plt.title('Company Response to Consumer and Sentiment across Product Bins')

# Adjust the legend position
plt.legend(loc='best')

# Show the plot
plt.show()


In [None]:
# Levene test: is the variance of the sentiment scores equal across company responses?
# Equal variance is an assumption of the one-way ANOVA that follows, so the test is run on
# the real sentiment scores (one group per company response), not on synthetic example data.

# Set the significance level (alpha)
alpha = 0.05

# One Series of non-null sentiment scores per company response
sentiment_by_response = [
    sentiment_df.loc[sentiment_df['company_response_to_consumer'] == response, 'sentiment'].dropna()
    for response in sentiment_df['company_response_to_consumer'].dropna().unique()
]

# Perform Levene test for variance comparison
tstat, pvalue = stats.levene(*sentiment_by_response)

print("Running Levene Test...")
if pvalue > alpha:
    print(f'p-value: {pvalue:.10f} > {alpha}')
    print("Equal variance can be assumed, proceed with ANOVA test...")
else:
    print("p-value:", pvalue)
    print("Equal variance cannot be assumed. Consider alternative tests for comparing groups.")


In [None]:

# Get unique categories of product_bins
unique_bins = sentiment_df['product_bins'].unique()

# One-way ANOVA per product bin: does mean sentiment differ between company responses?
for bin_category in unique_bins:
    # Create a subset of the data for the specific product_bins category
    subset = sentiment_df[sentiment_df['product_bins'] == bin_category]

    # One group of sentiment scores per company response seen in this bin
    response_groups = [subset[subset['company_response_to_consumer'] == response]['sentiment']
                       for response in subset['company_response_to_consumer'].unique()]

    print("Product Bins:", bin_category)

    # f_oneway needs at least two groups; a bin with a single response cannot be tested
    if len(response_groups) < 2:
        print("Only one company response in this bin; ANOVA skipped.")
        print()
        continue

    # Perform one-way ANOVA for the subset
    result = stats.f_oneway(*response_groups)
    print("ANOVA p-value:", result.pvalue)

    if result.pvalue < alpha:
        print("The p-value is less than alpha. There is a significant effect of sentiment on company response to the consumer.")
    else:
        print("The p-value is greater than or equal to alpha. There is no significant effect of sentiment on company response to the consumer.")

    print()  # Print an empty line between each category's results


## 6. Does narrative length relate to company response?

In [None]:
sentiment_df['message_length'] = sentiment_df['clean'].apply(len)

In [None]:
# Word count = number of whitespace-separated tokens in the lemmatized text.
# (len() on the string itself would count characters, not words.)
sentiment_df['word_count'] = sentiment_df['lemon'].apply(lambda x: len(x.split()))

In [None]:
# Word count = number of whitespace-separated tokens in the lemmatized text.
# This is the only assignment to word_count in this cell, so the column holds a true word
# count rather than a character length.
sentiment_df['word_count'] = sentiment_df['lemon'].apply(lambda x: len(x.split()))

In [None]:
sentiment_df.head(2)

In [None]:
sns.relplot(data=sentiment_df, x='message_length', y='sentiment', hue='company_response_to_consumer')
plt.show()

In [None]:
sns.relplot(data=sentiment_df, x='word_count', y='sentiment', hue='company_response_to_consumer')
plt.show()

In [None]:
# List the names of all registered matplotlib colormaps.
plt.colormaps()

In [None]:
# Colour each point by its response category so that cmap='Set1' takes effect;
# matplotlib ignores cmap when no `c` values are supplied.
response_codes = pd.factorize(sentiment_df['company_response_to_consumer'])[0]
plt.scatter(sentiment_df['message_length'], sentiment_df['company_response_to_consumer'],
            c=response_codes, cmap='Set1')

plt.xlabel('Message Length')
plt.ylabel('Company Response to Consumer')
plt.title('Relationship between Message Length and Company Response to Consumer')

plt.show()

In [None]:
# 'red' is a colour, not a colormap: pass it as `color` so the points are actually drawn red
# (cmap is ignored when no `c` values are supplied).
plt.scatter(sentiment_df['word_count'], sentiment_df['company_response_to_consumer'], color='red')

plt.xlabel('Word Count')
plt.ylabel('Company Response to Consumer')
plt.title('Relationship between Word Count and Company Response to Consumer')

plt.show()

In [None]:

# One-way ANOVA: does mean message length differ between company responses?
responses = sentiment_df['company_response_to_consumer']
length_groups = [sentiment_df[responses == response]['message_length']
                 for response in responses.unique()]

result = stats.f_oneway(*length_groups)
p_value = result.pvalue

print("ANOVA p-value:", p_value)
verdict = (
    "The p-value is less than alpha. There is a significant relationship between message length and company response to the consumer."
    if p_value < alpha else
    "The p-value is greater than or equal to alpha. There is no significant relationship between message length and company response to the consumer."
)
print(verdict)

## Testing Functions

In [None]:
def analyze_sentiment(sentiment_df, alpha=0.05, truncate=False):
    """Plot and test how sentiment relates to company response across product bins.

    This function answers the question: Do narratives with a neutral or positive sentiment
    analysis relating to bank account products lead to a response of closed with monetary relief?

    Steps:
      1. Bar plot of sentiment per product bin, split by company response.
      2. Levene test for equal variance of sentiment across company responses, run on the
         data itself (equal variance is an assumption of the ANOVA in step 3).
      3. One-way ANOVA of sentiment across company responses, run separately per product bin.

    Parameters
    ----------
    sentiment_df : DataFrame
        Must contain the columns 'product_bins', 'sentiment' and 'company_response_to_consumer'.
    alpha : float, default 0.05
        Significance level used for both tests.
    truncate : bool, default False
        Unused; kept so existing calls keep working.

    Returns
    -------
    None. The plot is shown and the test results are printed.
    """
    # --- 1. Bar plot -------------------------------------------------------------------
    plt.figure(figsize=(10, 6))
    sns.set(style="whitegrid")
    sns.barplot(data=sentiment_df, x='product_bins', y='sentiment', hue='company_response_to_consumer', ci=None, color='purple')
    plt.xlabel('Product Bins')
    plt.ylabel('Sentiment')
    plt.title('Company Response to Consumer and Sentiment Analysis across Product Bins')
    plt.legend(loc='best')
    plt.show()

    # --- 2. Levene test on the real sentiment scores -------------------------------------
    print("Running Levene Test...")
    overall_groups = _sentiment_by_response(sentiment_df)
    if len(overall_groups) < 2:
        # levene needs at least two samples to compare
        print("Fewer than two company responses present; Levene test skipped.")
        print()
    else:
        _, pvalue = stats.levene(*overall_groups)
        if pvalue > alpha:
            print(f'p-value: {pvalue:.10f} > {alpha}')
            print()
            print("Equal variance can be assumed, proceed with ANOVA test...")
            print()
        else:
            print("p-value:", pvalue)
            print()
            print("Equal variance cannot be assumed. Consider alternative tests for comparing groups.")
            print()

    # --- 3. One-way ANOVA per product bin ------------------------------------------------
    for bin_category in sentiment_df['product_bins'].unique():
        subset = sentiment_df[sentiment_df['product_bins'] == bin_category]
        bin_groups = _sentiment_by_response(subset)

        print("Product Bins:", bin_category)

        # f_oneway needs at least two groups; a bin with a single response cannot be tested
        if len(bin_groups) < 2:
            print("Only one company response in this bin; ANOVA skipped.")
            print()
            continue

        result = stats.f_oneway(*bin_groups)
        print("ANOVA p-value:", result.pvalue)

        if result.pvalue < alpha:
            print("The p-value is less than alpha. There is a significant effect of sentiment on company response to the consumer.")
        else:
            print("The p-value is greater than or equal to alpha. There is no significant effect of sentiment on company response to the consumer.")

        print()  # Print an empty line between each category's results


def _sentiment_by_response(frame):
    """Return one Series of non-null sentiment scores per company response present in `frame`."""
    responses = frame['company_response_to_consumer']
    return [frame.loc[responses == response, 'sentiment'].dropna()
            for response in responses.dropna().unique()]


In [None]:
analyze_sentiment(sentiment_df)

### Takeaways:
- Overall, there is a strong correlation between the sentiment of consumer complaints/narratives and the corresponding responses from companies.

1. Mortgage:
  - Consumer complaints/narratives exhibit predominantly positive sentiment, and companies provide an equal distribution of responses across different categories.
  
2. Credit Report:
  - Consumer complaints/narratives with positive sentiment tend to receive the "closed with monetary relief" response most frequently.
  - Overall, the sentiment of complaints/narratives is generally neutral to positive.
  
3. Debt Collection:
  - All consumer complaints/narratives have negative sentiment scores, and the complaints with the most negative scores typically receive an "untimely response."
  
4. Loans:
  - Complaints/narratives regarding loans have sentiment scores ranging from neutral to positive. Companies provide different responses irrespective of the sentiment score.
   
5. Bank:
  - Sentiment scores for bank-related complaints/narratives are somewhat mixed, ranging from neutral to negative. The more negative complaints tend to receive a "closed" or "untimely response."
  
6. Money Service:
  - Sentiment scores for complaints/narratives about money services vary between negative and positive. The most negative complaints receive a "closed" response.
  
7. Credit Card:
  - The majority of sentiment scores for credit card complaints/narratives range from neutral to positive. The most common response received by consumers is "closed with non-monetary relief."

- These findings indicate that the sentiment of consumer complaints/narratives has an influence on the type of response received from companies across different industry sectors.

## Summary 
* The analysis revealed a significant relationship between consumer sentiment in complaints/narratives and the corresponding company responses, indicating the importance of sentiment in consumer-company interactions.
* Sentiment patterns varied across industries, with positive sentiment in mortgage complaints, credit report complaints receiving "closed with monetary relief" responses, and consistently negative sentiment in debt collection complaints leading to "untimely response" from companies. These findings highlight the need to consider sentiment for effective consumer grievance resolution.

In [None]:
def analyze_message_length(sentiment_df, alpha=0.05):
    """
    Analyzes the relationship between message length and company response to the consumer.
    This function answers the question: Does narrative length relate to company response?

    Parameters
    ----------
    sentiment_df : DataFrame
        Must contain the columns 'message_length' and 'company_response_to_consumer'.
    alpha : float, default 0.05
        Significance level for the ANOVA.

    Returns
    -------
    None. A scatter plot is shown and the ANOVA result is printed.
    """
    responses = sentiment_df['company_response_to_consumer']

    # Colour each point by its response category so that cmap='Set1' takes effect;
    # matplotlib ignores cmap when no `c` values are supplied.
    response_codes = pd.factorize(responses)[0]
    plt.scatter(sentiment_df['message_length'], responses, c=response_codes, cmap='Set1')

    # Set the labels and title
    plt.xlabel('Message Length')
    plt.ylabel('Company Response to Consumer')
    plt.title('Relationship between Message Length and Company Response to Consumer')

    # Show the plot
    plt.show()

    # One-way ANOVA: one group of message lengths per distinct company response
    result = stats.f_oneway(*[sentiment_df[responses == response]['message_length']
                              for response in responses.unique()])

    p_value = result.pvalue

    print("ANOVA p-value:", p_value)
    if p_value < alpha:
        print("The p-value is less than alpha. There is a significant relationship between message length and company response to the consumer.")
    else:
        print("The p-value is greater than or equal to alpha. There is no significant relationship between message length and company response to the consumer.")

In [None]:
analyze_message_length(sentiment_df)

In [None]:
def analyze_word_count(sentiment_df, alpha=0.05):
    """
    Analyzes the relationship between word count and company response to the consumer.
    This function answers the question: Does narrative word count relate to company response?

    Only the word-count analysis is performed here; the message-length analysis is done by
    analyze_message_length.

    Parameters
    ----------
    sentiment_df : DataFrame
        Must contain the columns 'word_count' and 'company_response_to_consumer'.
    alpha : float, default 0.05
        Significance level for the ANOVA.

    Returns
    -------
    None. A scatter plot is shown and the ANOVA result is printed.
    """
    responses = sentiment_df['company_response_to_consumer']

    # Create the scatter plot. A plain colour is passed via `color`: cmap is ignored by
    # matplotlib when no `c` values are supplied.
    plt.scatter(sentiment_df['word_count'], responses, color='red')

    # Set the labels and title
    plt.xlabel('Word Count')
    plt.ylabel('Company Response to Consumer')
    plt.title('Relationship between Word Count and Company Response to Consumer')

    # Show the plot
    plt.show()

    # One-way ANOVA: one group of word counts per distinct company response
    result = stats.f_oneway(*[sentiment_df[responses == response]['word_count']
                              for response in responses.unique()])

    p_value = result.pvalue

    print("ANOVA p-value:", p_value)
    if p_value < alpha:
        print("The p-value is less than alpha. There is a significant relationship between word count and company response to the consumer.")
    else:
        print("The p-value is greater than or equal to alpha. There is no significant relationship between word count and company response to the consumer.")

In [None]:
analyze_word_count(sentiment_df)

## Modeling attempts

In [3]:
import model as m

In [5]:
train.head()

Unnamed: 0,date_received,company_name,state,tags,company_response_to_consumer,product_bins,clean,lemon
944870,2015-05-12,"CITIBANK, N.A.",CA,Average Person,Closed with monetary relief,mortgage,it has been since that i first applied for the...,since first applied refinance today decision c...
570225,2021-07-06,"EQUIFAX, INC.",CA,Average Person,Closed with explanation,credit_report,inquiry ive called and informed them that i di...,inquiry ive called informed applied car inquir...
523256,2017-10-11,"Franklin Collection Service, Inc.",GA,Average Person,Closed with explanation,debt_collection,i paid the bill in hopes to get it back but th...,paid bill hope get back said pay different ins...
753529,2017-12-18,Credit Plus Inc,CA,Average Person,Closed with explanation,credit_report,unknown inquiry on my credit reports from cred...,unknown inquiry credit report credit plus auth...
999772,2016-06-17,Selene Finance LP,OR,Average Person,Closed with explanation,mortgage,selene finance will not help me figure out why...,selene finance help figure many different amou...


In [None]:
# Build the feature matrix (X_*) and target (y_*) for each split.
# NOTE(review): m.encode presumably adds dummy columns for the categorical fields — confirm in model.py.
# The same column list is dropped from every split; the target is taken from the un-encoded frame.
X_train = m.encode(train)
X_train = X_train.drop(columns=['date_received','company_response_to_consumer','clean','state','company_name','tags','product_bins'])
y_train = train['company_response_to_consumer']
X_val = m.encode(validate)
X_val = X_val.drop(columns=['date_received','company_response_to_consumer','clean','state','company_name','tags','product_bins'])
y_val = validate['company_response_to_consumer']
X_test = m.encode(test)
X_test = X_test.drop(columns=['date_received','company_response_to_consumer','clean','state','company_name','tags','product_bins'])
y_test = test['company_response_to_consumer']
X_train.head()

In [6]:
import polars as pl

In [9]:
X_train = train[['lemon']]
y_train = train['company_response_to_consumer']
X_val = validate[['lemon']]
y_val = validate['company_response_to_consumer']
X_test = test[['lemon']]
y_test = test['company_response_to_consumer']
X_train.head()

Unnamed: 0,lemon
944870,since first applied refinance today decision c...
570225,inquiry ive called informed applied car inquir...
523256,paid bill hope get back said pay different ins...
753529,unknown inquiry credit report credit plus auth...
999772,selene finance help figure many different amou...


In [8]:
X_train_cv,X_val_cv,X_test_cv = m.make_cv(X_train, X_val, X_test)
X_train_tf,X_val_tf,X_test_tf = m.make_tfidf(X_train, X_val, X_test)

In [9]:
def tree_models(Xtr,ytr,Xv,yv):
    """Fit a grid of decision trees and tabulate their train/validation accuracy.

    Every combination of max_depth 1-5, min_samples_leaf 1-5 and class_weight
    ('balanced' or None) is fitted on (Xtr, ytr) and scored on both (Xtr, ytr) and (Xv, yv).

    Returns a DataFrame with one row per combination and the columns
    'model', 'params', 'tr_acc' and 'v_acc'.
    """
    depths = range(1, 6)
    leaf_sizes = range(1, 6)
    class_weights = ['balanced', None]

    rows = []
    for depth, leaf, weight in itertools.product(depths, leaf_sizes, class_weights):
        clf = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=leaf, class_weight=weight, random_state=123)
        clf.fit(Xtr, ytr)
        # one table row per fitted tree
        rows.append({
            'model': 'Decision Tree',
            'params': f"max_depth={depth}, min_samples_leaf={leaf}, class_weight={weight}, random_state=123",
            'tr_acc': clf.score(Xtr, ytr),
            'v_acc': clf.score(Xv, yv),
        })
    return pd.DataFrame(rows)

In [None]:
import itertools
from sklearn.tree import DecisionTreeClassifier

tree_models(X_train_cv,y_train,X_val_cv,y_val)

In [11]:
# Down-sample every split to 20% of each response class, then build the feature matrix
# and target for each down-sampled split.

# Response classes, in the order their samples are concatenated
RESPONSE_CLASSES = [
    'Closed with explanation',
    'Closed with non-monetary relief',
    'Closed with monetary relief',
    'Untimely response',
    'Closed',
]

# Columns removed from the encoded frame before modelling
NON_FEATURE_COLUMNS = ['date_received','company_response_to_consumer','clean','state','company_name','tags','product_bins']


def sample_by_response(frame, frac=.2, seed=123):
    """Return `frac` of the rows of every response class in `frame`, concatenated class by class.

    Each class is sampled independently with the same seed, and the per-class sample size is
    the class size times `frac`, rounded to the nearest integer.
    """
    pieces = []
    for response in RESPONSE_CLASSES:
        class_rows = frame[frame.company_response_to_consumer == response]
        n_rows = int(round(len(class_rows) * frac, 0))
        pieces.append(class_rows.sample(n_rows, random_state=seed))
    return pd.concat(pieces)


def features_and_target(frame):
    """Return (X, y): the encoded frame without the non-feature columns, and the response column."""
    features = m.encode(frame).drop(columns=NON_FEATURE_COLUMNS)
    return features, frame['company_response_to_consumer']


small_train = sample_by_response(train)
small_val = sample_by_response(validate)
small_test = sample_by_response(test)

X_train, y_train = features_and_target(small_train)
X_val, y_val = features_and_target(small_val)
X_test, y_test = features_and_target(small_test)
X_train.head()

Unnamed: 0,lemon,Older American,"Older American, Servicemember",Servicemember,credit_card,credit_report,debt_collection,loans,money_service,mortgage
48045,trying contact grain technology inc debt owed ...,0,0,0,0,0,1,0,0,0
1164880,bought large rug start vacation called filed d...,0,0,0,1,0,0,0,0,0
463976,original complaint went car dealership ohio lo...,0,0,0,0,1,0,0,0,0
490359,synopsis loan demand estate loan originated an...,0,0,0,0,0,0,0,0,1
316427,may concern writing dispute fraudulent charge ...,0,0,0,0,1,0,0,0,0


In [12]:
X_train_cv,X_val_cv,X_test_cv = m.make_cv(X_train, X_val, X_test)
X_train_tf,X_val_tf,X_test_tf = m.make_tfidf(X_train, X_val, X_test)

In [13]:
# Keep every column except the first one of each feature frame.
# NOTE(review): assumes the first column is the 'lemon' text column — confirm.
encoded_train = X_train.iloc[:,1:]
encoded_val = X_val.iloc[:,1:]
encoded_test = X_test.iloc[:,1:]

# Join those columns to the count-vectorizer features on the row index.
# NOTE(review): merge defaults to an inner join, so rows are silently dropped if the frames
# returned by m.make_cv / m.make_tfidf do not share the original index — confirm they do.
X_train_cve = encoded_train.merge(X_train_cv,left_index=True, right_index=True)
X_val_cve = encoded_val.merge(X_val_cv,left_index=True, right_index=True)
X_test_cve = encoded_test.merge(X_test_cv,left_index=True, right_index=True)

# Same join for the TF-IDF features.
X_train_tfe = encoded_train.merge(X_train_tf,left_index=True, right_index=True)
X_val_tfe = encoded_val.merge(X_val_tf,left_index=True, right_index=True)
X_test_tfe = encoded_test.merge(X_test_tf,left_index=True, right_index=True)

In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

def xgboost_hyperparam_search(X_train, y_train, X_val, y_val, hyperparams):
    """Fit one XGBoost classifier per parameter set and compare train/validation accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels.
    hyperparams : iterable of dict
        Each dict is passed as keyword arguments to xgb.XGBClassifier.

    Returns
    -------
    DataFrame with the columns 'Parameters', 'Train Accuracy' and 'Validation Accuracy',
    sorted by validation accuracy (best first) and limited to the 10 best models.
    When `hyperparams` is empty, an empty DataFrame with those columns is returned and
    nothing is plotted.
    """
    result_columns = ['Parameters', 'Train Accuracy', 'Validation Accuracy']
    results = []

    for params in hyperparams:
        model = xgb.XGBClassifier(**params)
        model.fit(X_train, y_train)

        train_pred = model.predict(X_train)
        train_acc = accuracy_score(y_train, train_pred)

        val_pred = model.predict(X_val)
        val_acc = accuracy_score(y_val, val_pred)

        results.append({'Parameters': params, 'Train Accuracy': train_acc, 'Validation Accuracy': val_acc})

    if not results:
        # Nothing was trained: an empty frame has no 'Validation Accuracy' column to sort by,
        # so return an empty table with the usual columns instead of failing.
        return pd.DataFrame(columns=result_columns)

    df_results = pd.DataFrame(results)
    df_results = df_results.sort_values(by='Validation Accuracy', ascending=False).head(10)

    # Plotting the top 10 models (x position = rank by validation accuracy)
    ranks = range(1, len(df_results) + 1)
    plt.figure(figsize=(10, 6))
    plt.plot(ranks, df_results['Train Accuracy'], label='Train Accuracy')
    plt.plot(ranks, df_results['Validation Accuracy'], label='Validation Accuracy')
    plt.xlabel('Model')
    plt.ylabel('Accuracy')
    plt.title('Top 10 Models - Train vs Validation Accuracy')
    plt.legend()
    plt.xticks(ranks)
    plt.show()

    return df_results


In [16]:
hyperparams = [
    {'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 100},
    {'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 200},
    {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 500}]

In [18]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

In [19]:
xgboost_hyperparam_search(X_train_cve, y_train_encoded, X_val_cve, y_val_encoded, hyperparams)

KeyboardInterrupt: 