# Consumer Complaint Database

https://catalog.data.gov/dataset/consumer-complaint-database

# Predicting the company response to consumer complaints

# import libraries

In [1]:
import time
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport


# pd.options.display.max_columns = None

# keep only 100000 records

In [None]:
with open('dataset.csv', 'r', encoding='utf-8') as f:
    lines = f.readlines()

with open('dataset.csv', 'w', encoding='utf-8') as f:
    f.writelines(lines[:100000])

del lines
del f

# print head of the data

In [2]:
df = pd.read_csv('dataset.csv')

# Generate a report of the data

In [None]:
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
profile.to_notebook_iframe()

# print info of the data

In [3]:
display(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944235 entries, 0 to 944234
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DR_NO           944235 non-null  int64  
 1   Date Rptd       944235 non-null  object 
 2   DATE OCC        944235 non-null  object 
 3   TIME OCC        944235 non-null  int64  
 4   AREA            944235 non-null  int64  
 5   AREA NAME       944235 non-null  object 
 6   Rpt Dist No     944235 non-null  int64  
 7   Part 1-2        944235 non-null  int64  
 8   Crm Cd          944235 non-null  int64  
 9   Crm Cd Desc     944235 non-null  object 
 10  Mocodes         811136 non-null  object 
 11  Vict Age        944235 non-null  int64  
 12  Vict Sex        817640 non-null  object 
 13  Vict Descent    817630 non-null  object 
 14  Premis Cd       944225 non-null  float64
 15  Premis Desc     943668 non-null  object 
 16  Weapon Used Cd  324477 non-null  float64
 17  Weapon Des

None

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,190326475,03/01/2020 12:00:00 AM,03/01/2020 12:00:00 AM,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,AA,Adult Arrest,510.0,998.0,,,1900 S LONGWOOD AV,,34.0375,-118.3506
1,200106753,02/09/2020 12:00:00 AM,02/08/2020 12:00:00 AM,1800,1,Central,182,1,330,BURGLARY FROM VEHICLE,...,IC,Invest Cont,330.0,998.0,,,1000 S FLOWER ST,,34.0444,-118.2628
2,200320258,11/11/2020 12:00:00 AM,11/04/2020 12:00:00 AM,1700,3,Southwest,356,1,480,BIKE - STOLEN,...,IC,Invest Cont,480.0,,,,1400 W 37TH ST,,34.021,-118.3002
3,200907217,05/10/2023 12:00:00 AM,03/10/2020 12:00:00 AM,2037,9,Van Nuys,964,1,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),...,IC,Invest Cont,343.0,,,,14000 RIVERSIDE DR,,34.1576,-118.4387
4,220614831,08/18/2022 12:00:00 AM,08/17/2020 12:00:00 AM,1200,6,Hollywood,666,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,1900 TRANSIENT,,34.0944,-118.3277


# how many records does the data have?

In [6]:
display("Number of records: ", df.shape[0])

'Number of records: '

944235

# how many features does the data have?

In my project, I will consider all the features except the 'Company response
to consumer' as the features.

In [7]:
display("Number of features: ", df.shape[1] - 1)

'Number of features: '

18

# How many many different classes exist in the dataset?

In [8]:
display("Number of classes: ", len(
    df['Company response to consumer'].unique()))
display("Number of examples per class:\n",
        df['Company response to consumer'].value_counts())

'Number of classes: '

139

'Number of examples per class:\n'

510    102036
624     74509
330     58311
354     58240
310     57497
        ...  
432         6
904         5
445         4
906         4
926         1
Name: Crm Cd, Length: 139, dtype: int64

# show how many NULL values exist in the dataset

In [9]:
display("Number of NULL values per feature:\n", df.isnull().sum())

'Number of NULL values per feature:\n'

Date Rptd              0
DATE OCC               0
TIME OCC               0
AREA                   0
Rpt Dist No            0
Part 1-2               0
Crm Cd                 0
Vict Age               0
Vict Sex          126595
Vict Descent      126605
Premis Cd             10
Weapon Used Cd    619758
Status Desc            0
Crm Cd 1              11
Crm Cd 2          875977
Crm Cd 3          941954
Crm Cd 4          944171
LOCATION               0
Cross Street      796643
dtype: int64

# Which featiures are not numerical?

In [10]:
display("Non numerical features:\n",
        df.select_dtypes(exclude=[np.number]).columns)

'Non numerical features:\n'

Index(['Date Rptd', 'DATE OCC', 'Vict Sex', 'Vict Descent', 'Status Desc',
       'LOCATION', 'Cross Street'],
      dtype='object')

# Find the best correlated Features in the Dataset

In [4]:
df_tmp = df.copy()
label_encoder = LabelEncoder()
for column in df_tmp.select_dtypes(include=['object']).columns:
    df_tmp[column] = label_encoder.fit_transform(df_tmp[column].astype(str))
df_tmp = (df_tmp - df_tmp.mean()) / df_tmp.std()
correlation = df_tmp.corr()

# Find the best-correlated features in pairs
for column in correlation.columns:
    display(correlation[column].sort_values(ascending=False).head(2))

DR_NO       1.000000
Crm Cd 4    0.152327
Name: DR_NO, dtype: float64

Date Rptd    1.000000
DATE OCC     0.917473
Name: Date Rptd, dtype: float64

DATE OCC     1.000000
Date Rptd    0.917473
Name: DATE OCC, dtype: float64

TIME OCC    1.000000
Vict Sex    0.045929
Name: TIME OCC, dtype: float64

AREA           1.000000
Rpt Dist No    0.999047
Name: AREA, dtype: float64

AREA NAME       1.000000
Vict Descent    0.083812
Name: AREA NAME, dtype: float64

Rpt Dist No    1.000000
AREA           0.999047
Name: Rpt Dist No, dtype: float64

Part 1-2    1.000000
Crm Cd 1    0.702323
Name: Part 1-2, dtype: float64

Crm Cd      1.000000
Crm Cd 1    0.999304
Name: Crm Cd, dtype: float64

Crm Cd Desc    1.000000
Weapon Desc    0.389155
Name: Crm Cd Desc, dtype: float64

Mocodes     1.000000
Vict Sex    0.299887
Name: Mocodes, dtype: float64

Vict Age    1.000000
Part 1-2    0.200301
Name: Vict Age, dtype: float64

Vict Sex        1.000000
Vict Descent    0.556223
Name: Vict Sex, dtype: float64

Vict Descent    1.000000
Vict Sex        0.556223
Name: Vict Descent, dtype: float64

Premis Cd    1.000000
Part 1-2     0.270579
Name: Premis Cd, dtype: float64

Premis Desc    1.00000
Mocodes        0.14874
Name: Premis Desc, dtype: float64

Weapon Used Cd    1.000000
Weapon Desc       0.678767
Name: Weapon Used Cd, dtype: float64

Weapon Desc       1.000000
Weapon Used Cd    0.678767
Name: Weapon Desc, dtype: float64

Status         1.000000
Status Desc    0.988045
Name: Status, dtype: float64

Status Desc    1.000000
Status         0.988045
Name: Status Desc, dtype: float64

Crm Cd 1    1.000000
Crm Cd      0.999304
Name: Crm Cd 1, dtype: float64

Crm Cd 2    1.000000
Crm Cd 3    0.292795
Name: Crm Cd 2, dtype: float64

Crm Cd 3    1.000000
Crm Cd 2    0.292795
Name: Crm Cd 3, dtype: float64

Crm Cd 4    1.000000
LOCATION    0.184528
Name: Crm Cd 4, dtype: float64

LOCATION    1.000000
Crm Cd 4    0.184528
Name: LOCATION, dtype: float64

Cross Street    1.000000
Premis Cd       0.251012
Name: Cross Street, dtype: float64

LAT         1.000000
Crm Cd 4    0.100286
Name: LAT, dtype: float64

LON         1.000000
Crm Cd 1    0.039721
Name: LON, dtype: float64

# Remove the features that have a high correlation with each other

In [5]:
# correlated with 'Date received'
df = df.drop(columns=['Date sent to company'])

# Remove the `Complaint ID` feature, because it's not useful for the
classification, Since all the values are unique.

In [None]:
df = df.drop(columns=['Complaint ID',
             'Consumer complaint narrative', 'Company public response'])

# In `ZIP code` replace the `XXXXX` with the `np.nan` and convert the feature
to float

In [None]:
df['ZIP code'] = df['ZIP code'].replace('XXXXX', np.nan)

# Transform the `Date received` to three features of year, month, and day

In [None]:
df['Date received'] = pd.to_datetime(df['Date received'])

df['YEAR RECEIVED'] = df['Date received'].dt.year
df['MONTH RECEIVED'] = df['Date received'].dt.month
df['DAY RECEIVED'] = df['Date received'].dt.day

df = df.drop(columns=['Date received'])

# Remove the rows that have NULL values in the `Issue` and `Company response
to consumer`

In [None]:
df.dropna(subset=['Issue', 'Company response to consumer'], inplace=True)

# transform the following features with LabelEncoder

'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Company', 'State',
'ZIP code', 'Company response to consumer', 'Timely response?',
'Consumer disputed?'

In [None]:
label_encoder = LabelEncoder()

for col in ['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Company', 'State',
            'ZIP code', 'Company response to consumer', 'Timely response?',
            'Consumer disputed?']:
    df[col] = label_encoder.fit_transform(df[col].astype(str))

# transform the following features with OneHotEncoder

'Tags', 'Consumer consent provided?', 'Submitted via',

In [None]:
one_hot_encoder = OneHotEncoder()
transformed_data = one_hot_encoder.fit_transform(df[['Tags']]).toarray()
column_names = one_hot_encoder.categories_[0]
column_names = np.where(pd.isna(column_names), 'tags_unknown', column_names)
df = pd.concat(
    [df, pd.DataFrame(transformed_data, columns=column_names)], axis=1)

transformed_data = one_hot_encoder.fit_transform(
    df[['Consumer consent provided?']]).toarray()
column_names = one_hot_encoder.categories_[0]
column_names = np.where(pd.isna(column_names),
                        'consumer_consent_unknown', column_names)
df = pd.concat(
    [df, pd.DataFrame(transformed_data, columns=column_names)], axis=1)

transformed_data = one_hot_encoder.fit_transform(
    df[['Submitted via']]).toarray()
column_names = one_hot_encoder.categories_[0]
column_names = np.where(pd.isna(column_names),
                        'submitted_via_unknown', column_names)
df = pd.concat(
    [df, pd.DataFrame(transformed_data, columns=column_names)], axis=1)

df = df.drop(
    columns=['tags_unknown', 'consumer_consent_unknown', 'submitted_via_unknown'])

df = df.drop(columns=['Tags', 'Consumer consent provided?', 'Submitted via'])

# drop the rest of the rows that have NULL values

In [20]:
df.dropna(inplace=True)

# Remove the data after the row 20000

In [None]:
df = df.iloc[:20000]

# simple impute

In [21]:
imputer = SimpleImputer(strategy='mean')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# display the head and info of the data

In [22]:
display(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710090 entries, 0 to 710089
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   TIME OCC        710090 non-null  float64
 1   AREA            710090 non-null  float64
 2   Rpt Dist No     710090 non-null  float64
 3   Part 1-2        710090 non-null  float64
 4   Crm Cd          710090 non-null  float64
 5   Vict Age        710090 non-null  float64
 6   Vict Sex        710090 non-null  float64
 7   Vict Descent    710090 non-null  float64
 8   Premis Cd       710090 non-null  float64
 9   Weapon Used Cd  710090 non-null  float64
 10  Crm Cd 1        710090 non-null  float64
 11  Crm Cd 2        710090 non-null  float64
 12  Crm Cd 3        710090 non-null  float64
 13  Crm Cd 4        710090 non-null  float64
 14  LOCATION        710090 non-null  float64
 15  Cross Street    710090 non-null  float64
 16  YEAR OCC        710090 non-null  float64
 17  MONTH OCC 

None

Unnamed: 0,TIME OCC,AREA,Rpt Dist No,Part 1-2,Crm Cd,Vict Age,Vict Sex,Vict Descent,Premis Cd,Weapon Used Cd,...,DAY OCC,YEAR RPTD,MONTH RPTD,DAY RPTD,Adult Arrest,Adult Other,Invest Cont,Juv Arrest,Juv Other,UNK
0,2130.0,7.0,784.0,1.0,510.0,0.0,3.0,12.0,101.0,0.0,...,1.0,2020.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1800.0,1.0,182.0,1.0,330.0,47.0,3.0,12.0,128.0,0.0,...,8.0,2020.0,2.0,9.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1700.0,3.0,356.0,1.0,480.0,19.0,4.0,18.0,502.0,0.0,...,4.0,2020.0,11.0,11.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2037.0,9.0,964.0,1.0,343.0,19.0,3.0,12.0,405.0,0.0,...,10.0,2023.0,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1200.0,6.0,666.0,2.0,354.0,28.0,3.0,7.0,102.0,0.0,...,17.0,2022.0,8.0,18.0,0.0,0.0,1.0,0.0,0.0,0.0


# normalize the data

In [23]:
scaler = StandardScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])

# display the head and info of the data

In [24]:
display(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710090 entries, 0 to 710089
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   TIME OCC        710090 non-null  float64
 1   AREA            710090 non-null  float64
 2   Rpt Dist No     710090 non-null  float64
 3   Part 1-2        710090 non-null  float64
 4   Crm Cd          710090 non-null  float64
 5   Vict Age        710090 non-null  float64
 6   Vict Sex        710090 non-null  float64
 7   Vict Descent    710090 non-null  float64
 8   Premis Cd       710090 non-null  float64
 9   Weapon Used Cd  710090 non-null  float64
 10  Crm Cd 1        710090 non-null  float64
 11  Crm Cd 2        710090 non-null  float64
 12  Crm Cd 3        710090 non-null  float64
 13  Crm Cd 4        710090 non-null  float64
 14  LOCATION        710090 non-null  float64
 15  Cross Street    710090 non-null  float64
 16  YEAR OCC        710090 non-null  float64
 17  MONTH OCC 

None

Unnamed: 0,TIME OCC,AREA,Rpt Dist No,Part 1-2,Crm Cd,Vict Age,Vict Sex,Vict Descent,Premis Cd,Weapon Used Cd,...,DAY OCC,YEAR RPTD,MONTH RPTD,DAY RPTD,Adult Arrest,Adult Other,Invest Cont,Juv Arrest,Juv Other,UNK
0,1.239951,-0.596604,-0.53513,-0.948521,0.03939,-1.730151,0.669496,0.322063,-1.077201,-0.745585,...,-1.592666,-1.376571,-1.024059,-1.647819,3.081694,-0.374523,-1.8612,-0.060258,-0.045948,-0.002654
1,0.7345,-1.577536,-1.519459,-0.948521,-0.770294,0.642735,0.669496,0.322063,-0.952161,-0.745585,...,-0.813416,-1.376571,-1.314926,-0.742875,-0.324497,-0.374523,0.537288,-0.060258,-0.045948,-0.002654
2,0.581334,-1.250559,-1.234952,-0.948521,-0.095557,-0.770899,1.56755,1.333833,0.779873,-0.745585,...,-1.258702,-1.376571,1.302878,-0.516639,-0.324497,-0.374523,0.537288,-0.060258,-0.045948,-0.002654
3,1.097505,-0.269627,-0.240812,-0.948521,-0.711817,-0.770899,0.669496,0.322063,0.330656,-0.745585,...,-0.590774,1.374355,-0.442325,-0.629757,-0.324497,-0.374523,0.537288,-0.060258,-0.045948,-0.002654
4,-0.1845,-0.760093,-0.728071,1.054272,-0.662336,-0.316516,0.669496,-0.521079,-1.07257,-0.745585,...,0.188475,0.45738,0.430277,0.275186,-0.324497,-0.374523,0.537288,-0.060258,-0.045948,-0.002654


# train data with 5 different classification models

- KNeighborsClassifier
- DecisionTreeClassifier
- RandomForestClassifier
- AdaBoostClassifier
- Naive Bayes

I will save the accuracy and the time it took to train the model for each
model in a dictionary.

# split the data to X and y

In [25]:
X = df.drop(columns=['Company response to consumer'])
y = df['Company response to consumer'].astype(int)  # Ensure y is categorical

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# KNeighborsClassifier

In [26]:
knn = KNeighborsClassifier()
start_time = time.time()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
end_time = time.time()
knn_accuracy = accuracy_score(y_test, knn_pred)

# display(f"KNeighborsClassifier Accuracy: {knn_accuracy}")
# display(f"Training Time: {end_time - start_time} seconds")

models_info_dict = {}

models_info_dict['KNeighborsClassifier'] = {
    'accuracy': knn_accuracy, 'time': end_time - start_time
}

# DecisionTreeClassifier

In [27]:
dt = DecisionTreeClassifier()
start_time = time.time()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
end_time = time.time()
dt_accuracy = accuracy_score(y_test, dt_pred)

# display(f"DecisionTreeClassifier Accuracy: {dt_accuracy}")
# display(f"Training Time: {end_time - start_time} seconds")

models_info_dict['DecisionTreeClassifier'] = {
    'accuracy': dt_accuracy, 'time': end_time - start_time
}

# RandomForestClassifier

In [28]:
rf = RandomForestClassifier()
start_time = time.time()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
end_time = time.time()
rf_accuracy = accuracy_score(y_test, rf_pred)

# display(f"RandomForestClassifier Accuracy: {rf_accuracy}")
# display(f"Training Time: {end_time - start_time} seconds")

models_info_dict['RandomForestClassifier'] = {
    'accuracy': rf_accuracy, 'time': end_time - start_time
}

# AdaBoostClassifier

In [29]:
ada = AdaBoostClassifier()
start_time = time.time()
ada.fit(X_train, y_train)
ada_pred = ada.predict(X_test)
end_time = time.time()
ada_accuracy = accuracy_score(y_test, ada_pred)

# display(f"AdaBoostClassifier Accuracy: {ada_accuracy}")
# display(f"Training Time: {end_time - start_time} seconds")

models_info_dict['AdaBoostClassifier'] = {
    'accuracy': ada_accuracy, 'time': end_time - start_time
}



# Naive Bayes

In [30]:
nb = LogisticRegression()
start_time = time.time()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
end_time = time.time()
nb_accuracy = accuracy_score(y_test, nb_pred)

display(f"Naive Bayes Accuracy: {nb_accuracy}")
display(f"Training Time: {end_time - start_time} seconds")

models_info_dict['Naive Bayes'] = {
    'accuracy': nb_accuracy, 'time': end_time - start_time
}

'Naive Bayes Accuracy: 0.7827247250348547'

'Training Time: 7.711880922317505 seconds'

# display the models info

In [31]:
display("With random split we reached the following results:")
display(models_info_dict)

'With random split we reached the following results:'

{'KNeighborsClassifier': {'accuracy': 0.8019546818008985,
  'time': 100.21694040298462},
 'DecisionTreeClassifier': {'accuracy': 0.8351546987001648,
  'time': 8.404108762741089},
 'RandomForestClassifier': {'accuracy': 0.8937529045613937,
  'time': 147.95581793785095},
 'AdaBoostClassifier': {'accuracy': 0.686574941204636,
  'time': 44.28265070915222},
 'Naive Bayes': {'accuracy': 0.7827247250348547, 'time': 7.711880922317505}}

# 5-fold approach to measure the performance of the system

In [32]:
# KFold
models_info_dict = {}

kf = KFold(n_splits=5)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
ada = AdaBoostClassifier()
nb = LogisticRegression()

t1 = time.time()
knn_scores = cross_val_score(knn, X, y, cv=kf)
t2 = time.time()
dt_scores = cross_val_score(dt, X, y, cv=kf)
t3 = time.time()
rf_scores = cross_val_score(rf, X, y, cv=kf)
t4 = time.time()
ada_scores = cross_val_score(ada, X, y, cv=kf)
t5 = time.time()
nb_scores = cross_val_score(nb, X, y, cv=kf)
t6 = time.time()

display("With 5-fold approach we reached the following results:")

models_info_dict['KNeighborsClassifier'] = {
    'accuracy': knn_scores.mean(), 'time': t2 - t1
}
models_info_dict['DecisionTreeClassifier'] = {
    'accuracy': dt_scores.mean(), 'time': t3 - t2
}
models_info_dict['RandomForestClassifier'] = {
    'accuracy': rf_scores.mean(), 'time': t4 - t3
}
models_info_dict['AdaBoostClassifier'] = {
    'accuracy': ada_scores.mean(), 'time': t5 - t4
}
models_info_dict['Naive Bayes'] = {
    'accuracy': nb_scores.mean(), 'time': t6 - t5
}

display(models_info_dict)



'With 5-fold approach we reached the following results:'

{'KNeighborsClassifier': {'accuracy': 0.799645115407906,
  'time': 521.1111586093903},
 'DecisionTreeClassifier': {'accuracy': 0.8266036699573294,
  'time': 39.95061659812927},
 'RandomForestClassifier': {'accuracy': 0.8903378444985848,
  'time': 726.5054469108582},
 'AdaBoostClassifier': {'accuracy': 0.6853117210494444,
  'time': 210.71078276634216},
 'Naive Bayes': {'accuracy': 0.7797589038009267, 'time': 35.28852081298828}}

# 10 best features from the dataset

In [33]:
best_features = SelectKBest(score_func=f_classif, k=10)
fit = best_features.fit(X, y)
display("10 best features from the dataset:", X.columns[fit.get_support()])
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(X.columns)
feature_scores = pd.concat([df_columns, df_scores], axis=1)
feature_scores.columns = ['Feature', 'Score']
display(feature_scores.nlargest(10, 'Score'))

'10 best features from the dataset:'

Index(['Part 1-2', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd',
       'Weapon Used Cd', 'Crm Cd 2', 'Crm Cd 3', 'LOCATION', 'Cross Street'],
      dtype='object')

Unnamed: 0,Feature,Score
3,Part 1-2,107798.816236
8,Weapon Used Cd,23854.820186
10,Crm Cd 2,11177.625982
14,Cross Street,8777.316598
7,Premis Cd,5616.897241
6,Vict Descent,4225.36811
13,LOCATION,2937.982336
4,Vict Age,889.026909
11,Crm Cd 3,836.662647
5,Vict Sex,479.789881
