# Importing Required Libraries

In [24]:
# Required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load Data

In [25]:
# Load the data
train_data = pd.read_csv('P1 Data/Consumer_Complaints_train.csv')

# View the first few rows
print(train_data.head())

# Check data types and missing values
print(train_data.info())

# Summary statistics of numerical columns
print(train_data.describe())

# Check for missing values
print(train_data.isnull().sum())

  Date received                  Product                  Sub-product  \
0    2014-05-15              Credit card                          NaN   
1    2014-09-18  Bank account or service  (CD) Certificate of deposit   
2    2014-03-13         Credit reporting                          NaN   
3    2015-07-17              Credit card                          NaN   
4    2014-11-20              Credit card                          NaN   

                                      Issue       Sub-issue  \
0                         Billing statement             NaN   
1  Making/receiving payments, sending money             NaN   
2    Incorrect information on credit report  Account status   
3                         Billing statement             NaN   
4                         Transaction issue             NaN   

                        Consumer complaint narrative  \
0                                                NaN   
1                                                NaN   
2             

# Handling Date Columns
The PDF clearly states that you should not use date columns as-is, but instead create new features from them. Here's how you can proceed:

Do:

Extract the month, year, and week from the date columns.
Consider creating time-based features like whether a complaint was filed in the first or last week of the month.
Don't:

Use raw date columns directly as features.
Here’s how to extract features from date columns:

In [26]:
# Convert date columns to datetime type
train_data['Date received'] = pd.to_datetime(train_data['Date received'])

# Create new features from the 'Date received'
train_data['Year_received'] = train_data['Date received'].dt.year
train_data['Month_received'] = train_data['Date received'].dt.month
train_data['Week_received'] = train_data['Date received'].dt.isocalendar().week

# Drop the original date columns if not needed
train_data.drop(columns=['Date received'], inplace=True)


# Categorical Variables
1. Do:
Encode categorical variables (like Product, Issue, and Company) using LabelEncoder or OneHotEncoder.
2. Don't:
Treat categorical variables as continuous or numeric without encoding.
Let’s identify the categorical variables and encode them:

In [27]:
from sklearn.preprocessing import LabelEncoder

# Example categorical columns to encode
categorical_cols = ['Product', 'Issue', 'Company']

# Label encoding for categorical variables
le = LabelEncoder()
for col in categorical_cols:
    train_data[col] = le.fit_transform(train_data[col].astype(str))

# View encoded data
print(train_data.head())


   Product                  Sub-product  Issue       Sub-issue  \
0        2                          NaN     14             NaN   
1        0  (CD) Certificate of deposit     63             NaN   
2        3                          NaN     52  Account status   
3        2                          NaN     14             NaN   
4        2                          NaN     88             NaN   

                        Consumer complaint narrative  \
0                                                NaN   
1                                                NaN   
2                                                NaN   
3  My credit card statement from US Bank, XXXX. X...   
4                                                NaN   

                            Company public response  Company State ZIP code  \
0                                               NaN     3192    MI    48342   
1                                               NaN     2662    PA    18042   
2                            

# Handling Text Data
Do:

Handle the Consumer complaint narrative column creatively. Consider using techniques like TF-IDF (Term Frequency-Inverse Document Frequency) to convert this text data into meaningful features.
Don't:

Use the raw text data directly without transforming it.
Here’s an example of how to apply TF-IDF:

In [67]:
# Handling missing data (for the 'Consumer complaint narrative' column)
train_data['Consumer complaint narrative'] = train_data['Consumer complaint narrative'].fillna("")
test_data['Consumer complaint narrative'] = test_data['Consumer complaint narrative'].fillna("")

# Handling Missing Values
Do:
Impute missing values if they are few.
Consider dropping columns with too many missing values or creating new features to track the presence of missing data.
Don't:
Arbitrarily remove rows or columns without analyzing the distribution of missing values.
Here’s how you can handle missing data:

In [36]:
# Apply median only to numeric columns
numeric_cols = train_data.select_dtypes(include=['number']).columns
train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].median())
# Fill numeric columns with median
train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].median())

# Fill non-numeric columns with the most frequent value
categorical_cols = train_data.select_dtypes(include=['object']).columns
train_data[categorical_cols] = train_data[categorical_cols].fillna(train_data[categorical_cols].mode().iloc[0])


# Handling ZIP Codes
Do:
Treat ZIP codes as categorical variables (you can encode them).
Don't:
Use ZIP codes as numeric variables.
If you decide to encode ZIP codes:

In [35]:
# Encode ZIP codes as categorical
train_data['ZIP code'] = train_data['ZIP code'].astype(str)  # Convert to string if needed
train_data['ZIP code'] = le.fit_transform(train_data['ZIP code'])


# EDA Visualization
It’s good practice to visualize key aspects of your data. Here are a few visualizations you can create:

In [38]:
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Print available column names to investigate
print("Available columns in the dataset:")
print(train_data.columns)

# Step 2: Strip any leading/trailing spaces from column names (if necessary)
train_data.columns = train_data.columns.str.strip()

# Step 3: Identify the correct column
# Try using 'Consumer disputed', if not found, print a message and list column names
if 'Consumer disputed' in train_data.columns:
    correct_column = 'Consumer disputed'
elif 'Consumer_disputed' in train_data.columns:
    correct_column = 'Consumer_disputed'
else:
    print("Error: The column 'Consumer disputed' or 'Consumer_disputed' was not found in the dataset.")
    print("Please check the available columns listed above and replace 'Consumer disputed' with the correct column name.")
    # Stop the execution if no valid column is found
    raise ValueError("Column 'Consumer disputed' not found. Check column names above.")

# Step 4: Plot the count plot using the identified correct column
sns.countplot(data=train_data, x=correct_column)
plt.title('Distribution of Consumer Disputes')
plt.show()


Available columns in the dataset:
Index(['Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Company public response', 'Company', 'State', 'ZIP code', 'Tags',
       'Consumer consent provided?',
       ...
       'were', 'what', 'when', 'which', 'will', 'with', 'would', 'xx', 'xxxx',
       'you'],
      dtype='object', length=119)
Error: The column 'Consumer disputed' or 'Consumer_disputed' was not found in the dataset.
Please check the available columns listed above and replace 'Consumer disputed' with the correct column name.


ValueError: Column 'Consumer disputed' not found. Check column names above.

In [79]:
# Required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load training and test data
train_data = pd.read_csv('P1 Data/Consumer_Complaints_train.csv')
test_data = pd.read_csv('P1 Data/Consumer_Complaints_test_share.csv')

# Display the first few rows of the training data
print(train_data.head())

# 1. Feature Engineering
# Handling date columns (example: convert date columns to year, month, and day)
train_data['Date received'] = pd.to_datetime(train_data['Date received'])
train_data['Year_received'] = train_data['Date received'].dt.year
train_data['Month_received'] = train_data['Date received'].dt.month
train_data['Day_received'] = train_data['Date received'].dt.day

test_data['Date received'] = pd.to_datetime(test_data['Date received'])
test_data['Year_received'] = test_data['Date received'].dt.year
test_data['Month_received'] = test_data['Date received'].dt.month
test_data['Day_received'] = test_data['Date received'].dt.day

# Handling categorical variables using LabelEncoder
categorical_cols = ['Product', 'Issue', 'Company']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col].astype(str))
    test_data[col] = le.transform(test_data[col].astype(str))  # Use same encoder for test data
    label_encoders[col] = le

# Handling text data using TF-IDF for 'Consumer complaint narrative'
tfidf = TfidfVectorizer(max_features=100)  # Limit features for efficiency
train_text_features = tfidf.fit_transform(train_data['Consumer complaint narrative'].astype(str))
test_text_features = tfidf.transform(test_data['Consumer complaint narrative'].astype(str))

# Convert sparse matrix to dense for concatenation
train_text_features = train_text_features.toarray()
test_text_features = test_text_features.toarray()

# Handling missing data (simple imputation or dropping columns)
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

# Dropping irrelevant columns (e.g., 'Consumer ID', 'ZIP code', 'Date sent to company')
train_data.drop(['Consumer ID', 'ZIP code', 'Date sent to company'], axis=1, inplace=True)
test_data.drop(['Consumer ID', 'ZIP code', 'Date sent to company'], axis=1, inplace=True)

# Feature scaling for numeric columns (if needed)
scaler = StandardScaler()
numeric_cols = ['Year_received', 'Month_received', 'Day_received']  # Add more numeric columns if available
train_data[numeric_cols] = scaler.fit_transform(train_data[numeric_cols])
test_data[numeric_cols] = scaler.transform(test_data[numeric_cols])

# Combine text features with other features
train_data = np.hstack((train_data.drop('Consumer disputed?', axis=1).values, train_text_features))
test_data = np.hstack((test_data.values, test_text_features))

# Target variable
y_train = train_data['Consumer disputed?']

# 2. Model Building (RandomForestClassifier)
model = RandomForestClassifier(random_state=42)

# 3. Parameter Tuning (Grid Search)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best estimator
best_model = grid_search.best_estimator_

# 4. Predict on the test set
y_pred = best_model.predict_proba(X_test)[:, 1]  # Predict probabilities for AUC calculation

# Create the submission file
submission = pd.DataFrame({'Consumer disputed?': y_pred})
submission.to_csv('submission.csv', index=False)

# 5. Model Evaluation (AUC score on training set as validation)
y_train_pred = best_model.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, y_train_pred)
print(f'Training AUC Score: {train_auc}')


  Date received                  Product                  Sub-product  \
0    2014-05-15              Credit card                          NaN   
1    2014-09-18  Bank account or service  (CD) Certificate of deposit   
2    2014-03-13         Credit reporting                          NaN   
3    2015-07-17              Credit card                          NaN   
4    2014-11-20              Credit card                          NaN   

                                      Issue       Sub-issue  \
0                         Billing statement             NaN   
1  Making/receiving payments, sending money             NaN   
2    Incorrect information on credit report  Account status   
3                         Billing statement             NaN   
4                         Transaction issue             NaN   

                        Consumer complaint narrative  \
0                                                NaN   
1                                                NaN   
2             

ValueError: y contains previously unseen labels: 'Kivell, Rayment and Francis, P.C.'

# Load Data and Initial EDA

In [9]:
import pandas as pd

# Load the training and test datasets
train_df = pd.read_csv('D:\\DataScience\\Projects\\Projects\\1 Class Project\\P1 Data\\Consumer_Complaints_train.csv')
test_df = pd.read_csv('D:\\DataScience\\Projects\\Projects\\1 Class Project\\P1 Data\\Consumer_Complaints_test_share.csv')

# Check missing values
missing_values = train_df.isnull().sum().sort_values(ascending=False)
missing_percentage = (missing_values / len(train_df)) * 100
print(missing_percentage[missing_percentage > 0])

# Drop columns with excessive missing values
columns_to_drop = ['Tags', 'Consumer complaint narrative', 'Company public response', 'Consumer consent provided?', 'Sub-issue']
train_df_clean = train_df.drop(columns=columns_to_drop)

# Convert date columns to datetime and engineer features
train_df_clean['Date received'] = pd.to_datetime(train_df_clean['Date received'], errors='coerce')
train_df_clean['Date sent to company'] = pd.to_datetime(train_df_clean['Date sent to company'], errors='coerce')
train_df_clean['received_month'] = train_df_clean['Date received'].dt.month
train_df_clean['received_weekday'] = train_df_clean['Date received'].dt.weekday
train_df_clean['time_diff'] = (train_df_clean['Date sent to company'] - train_df_clean['Date received']).dt.days

# Drop original date columns
train_df_clean = train_df_clean.drop(columns=['Date received', 'Date sent to company'])


Tags                            85.952540
Consumer complaint narrative    84.303783
Company public response         81.106181
Consumer consent provided?      71.680382
Sub-issue                       61.164748
Sub-product                     28.943755
ZIP code                         0.804313
State                            0.802431
dtype: float64


# Handle Missing Values and Encode Categorical Variables

In [13]:
# Fill missing values without using inplace=True to avoid warnings
train_df_clean['Sub-product'] = train_df_clean['Sub-product'].fillna('Unknown')
train_df_clean['State'] = train_df_clean['State'].fillna('Unknown')
train_df_clean['ZIP code'] = train_df_clean['ZIP code'].fillna('Unknown')

# Encoding categorical variables using one-hot encoding for simplicity
train_df_encoded = pd.get_dummies(train_df_clean, 
                                  columns=['Product', 'Sub-product', 'Issue', 'Company', 
                                           'State', 'Submitted via', 'Company response to consumer', 
                                           'Timely response?'], 
                                  drop_first=True)

# Convert target variable to binary (Yes = 1, No = 0)
train_df_encoded['Consumer disputed?'] = train_df_encoded['Consumer disputed?'].apply(lambda x: 1 if x == 'Yes' else 0)

# Check the cleaned and encoded data
train_df_encoded.head()


Unnamed: 0,ZIP code,Consumer disputed?,Complaint ID,received_month,received_weekday,time_diff,Product_Consumer Loan,Product_Credit card,Product_Credit reporting,Product_Debt collection,...,Submitted via_Postal mail,Submitted via_Referral,Submitted via_Web,Company response to consumer_Closed with explanation,Company response to consumer_Closed with monetary relief,Company response to consumer_Closed with non-monetary relief,Company response to consumer_Closed with relief,Company response to consumer_Closed without relief,Company response to consumer_Untimely response,Timely response?_Yes
0,48342,0,856103,5,3,1,False,True,False,False,...,False,False,True,True,False,False,False,False,False,True
1,18042,0,1034666,9,3,6,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
2,92427,0,756363,3,3,21,False,False,True,False,...,False,True,False,False,False,True,False,False,False,True
3,305XX,0,1474177,7,4,0,False,True,False,False,...,False,False,True,False,True,False,False,False,False,True
4,02127,0,1132572,11,3,8,False,True,False,False,...,False,False,True,True,False,False,False,False,False,True


# Model Building (Logistic Regression)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Check if 'ZIP code' exists before dropping it
if 'ZIP code' in train_df_encoded.columns:
    train_df_encoded = train_df_encoded.drop(columns=['ZIP code'])

# Split data into features and target
X = train_df_encoded.drop(columns=['Consumer disputed?', 'Complaint ID'])
y = train_df_encoded['Consumer disputed?']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a RandomForest model (faster for large datasets)
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict_proba(X_val)[:, 1]

# Calculate AUC score
auc_score = roc_auc_score(y_val, y_pred)
print(f'AUC Score: {auc_score}')


AUC Score: 0.6019561298421904


# Test Predictions and Submission

In [19]:
# Assuming the RandomForestClassifier model is already trained

# Preprocess the test data similarly to the training data
test_df_clean = test_df.drop(columns=columns_to_drop)

# Fill missing values in the test set
test_df_clean['Sub-product'] = test_df_clean['Sub-product'].fillna('Unknown')
test_df_clean['State'] = test_df_clean['State'].fillna('Unknown')
test_df_clean['ZIP code'] = test_df_clean['ZIP code'].fillna('Unknown')

# Convert date columns to datetime
test_df_clean['Date received'] = pd.to_datetime(test_df_clean['Date received'], errors='coerce')
test_df_clean['Date sent to company'] = pd.to_datetime(test_df_clean['Date sent to company'], errors='coerce')

# Feature engineering for test data
test_df_clean['received_month'] = test_df_clean['Date received'].dt.month
test_df_clean['received_weekday'] = test_df_clean['Date received'].dt.weekday
test_df_clean['time_diff'] = (test_df_clean['Date sent to company'] - test_df_clean['Date received']).dt.days

# Drop original date columns
test_df_clean = test_df_clean.drop(columns=['Date received', 'Date sent to company'])

# Encode categorical variables in the test set
test_df_encoded = pd.get_dummies(test_df_clean, 
                                  columns=['Product', 'Sub-product', 'Issue', 'Company', 
                                           'State', 'Submitted via', 'Company response to consumer', 
                                           'Timely response?'], 
                                  drop_first=True)

# Ensure test and train columns match
test_df_encoded = test_df_encoded.reindex(columns=X_train.columns, fill_value=0)

# Make predictions on the test set using the trained RandomForest model
test_preds = model.predict_proba(test_df_encoded)[:, 1]

# Create submission dataframe
submission_df = pd.DataFrame({'Complaint ID': test_df['Complaint ID'], 'Consumer disputed': test_preds})

# Specify the path where you want to save the submission file
submission_path = r'C:\Users\D MADHAN MOHAN\Documents\submission.csv'  # Update the path as needed

# Save the submission to a CSV file
submission_df.to_csv(submission_path, index=False)
print(f"Submission file saved as '{submission_path}'")


Submission file saved as 'C:\Users\D MADHAN MOHAN\Documents\submission.csv'
