In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import numpy as np
from pprint import pprint as pp
import csv
from pathlib import Path
import seaborn as sns
from itertools import product
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline 

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN

import gensim
from gensim import corpora

In [3]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

In [4]:
pd.set_option('display.max_columns', None)

### Retrieving the data

In [5]:
from urllib.request import urlretrieve
urlretrieve('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data', 'german.data')

('german.data', <http.client.HTTPMessage at 0x1dc54666c90>)

In [6]:
# Retrieving the dataset

import pandas as pd
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data', 
                        delimiter=' ',header=None)

In [7]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2


## German Credit Dataset With Header Names

### Data Contains Encoded values (e.g. A11, A121) and numerical values for the rest [DF1]

In [8]:
df1 = df.copy()

In [9]:
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2


In [10]:
df1.columns=['account_balance','duration','credit_history','purpose',
                   'credit_amount','savings_bond_value','employed_since',
                   'intallment_rate','sex_marital','guarantor','residence_since',
                   'property','age','other_installment_plans','type_of_housing',
                   'nr_of_existing_credits','job','number_of_dependents','telephone',
                   'foreign','target']

In [11]:
df1.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2


### Data Contains Descriptions of the Encodings and numerical values for the rest [DF2]

In [12]:
df2 = df1.copy()

In [13]:
df2.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2


In [14]:
#replacing coded values with their description

df2 = df2.replace(['A11','A12','A13','A14'],
                              ['< 0','0-200 DM','>=200 DM','no checking account'])

df2 = df2.replace(['A30','A31','A32','A33','A34'],
                              ['no credits taken/all credits paid back duly','all credits at this bank paid back duly','existing credits paid back duly till now','delay in paying off in the past','critical account/other credits existing (not at this bank)'])                           
                              
df2 = df2.replace(['A40','A41','A42','A43','A44','A45','A46','A47','A48','A49','A410'],
                              ['car(new)','car(used)','furniture/equipment','radio/television','domestic appliances','repairs','education','vacation','retraining','business','others'])

df2 = df2.replace(['A61','A62','A63','A64', 'A65'],
                              ['< 100 DM','100-500 DM','500-1000 DM','>=1000 DM', 'unknown/no savings account'])

df2 = df2.replace(['A71','A72','A73','A74','A75'],
                              ['unemployed','<1 year','1-4 years','4-7 years','>=7 years'])

df2 = df2.replace(['A91','A92','A93','A94','A95'],
                              ['male;divorced/separated','female;divorced/separated/married','male;single','male;married/widowed','female;single'])

df2 = df2.replace(['A101','A102','A103'],
                              ['bank','stores','none'])

df2 = df2.replace(['A121','A122','A123','A124'],
                              ['real estate','building society savings agreement/life insurance','car or other','unknown / no property'])

df2 = df2.replace(['A141','A142','A143'],
                              ['bank','stores','none'])

df2 = df2.replace(['A151','A152','A153'],
                              ['rent','own','for free'])

df2 = df2.replace(['A171','A172','A173','A174'],
                              ['unemployed/unskilled;non-resident','unskilled;resident','skilled employee;official','management/self-employed/highly qualified employee/officer'])

df2 = df2.replace(['A191','A192'],
                              ['No','Yes'])

df2 = df2.replace(['A201','A202'],
                              ['Yes','No'])

In [15]:
df2.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,< 0,6,critical account/other credits existing (not a...,radio/television,1169,unknown/no savings account,>=7 years,4,male;single,bank,4,real estate,67,none,own,2,skilled employee;official,1,Yes,Yes,1
1,0-200 DM,48,existing credits paid back duly till now,radio/television,5951,< 100 DM,1-4 years,2,female;divorced/separated/married,bank,2,real estate,22,none,own,1,skilled employee;official,1,No,Yes,2
2,no checking account,12,critical account/other credits existing (not a...,education,2096,< 100 DM,4-7 years,2,male;single,bank,3,real estate,49,none,own,1,unskilled;resident,2,No,Yes,1
3,< 0,42,existing credits paid back duly till now,furniture/equipment,7882,< 100 DM,4-7 years,2,male;single,none,4,building society savings agreement/life insurance,45,none,for free,1,skilled employee;official,2,No,Yes,1
4,< 0,24,delay in paying off in the past,car(new),4870,< 100 DM,1-4 years,3,male;single,bank,4,unknown / no property,53,none,for free,2,skilled employee;official,2,No,Yes,2


In [98]:
# Assuming df is your DataFrame
df2.to_csv('df2.csv')

#### Data Contains Descriptions of the Encodings and numerical values categorized based on quantiles [DF2b]

In [16]:
df2b = df2.copy()

In [17]:
df2b['credit_amount_quantile'] = pd.qcut(df2b['credit_amount'], q=10, labels=False, duplicates='drop')

# Mapping quantile groups to custom names
quantile_names = {i: f'A{i+50}' for i in range(df2b['credit_amount_quantile'].nunique())}
df2b['credit_amount'] = df2b['credit_amount_quantile'].map(quantile_names)

df2b.drop(columns='credit_amount_quantile', inplace=True)

In [18]:
# Define mapping dictionary
mapping_dict = {0:'A80', 1: 'A81', 2: 'A82', 3: 'A83', 4: 'A84', 5: 'A85', 6: 'A86', 7: 'A87'}

# Apply mapping to the column
df2b['intallment_rate'] = df2b['intallment_rate'].map(mapping_dict)

In [19]:
# Grouping the 'duration' column into 10% quantiles
df2b['duration_quantile'] = pd.qcut(df2b['duration'], q=10, labels=False, duplicates='drop')

# Mapping quantile groups to custom names
quantile_names = {i: f'A{i+20}' for i in range(df2b['duration_quantile'].nunique())}
df2b['duration'] = df2b['duration_quantile'].map(quantile_names)

df2b.drop(columns='duration_quantile', inplace=True)

In [20]:
df2b['age_quantile'] = pd.qcut(df2b['age'], q=10, labels=False, duplicates='drop')

# Mapping quantile groups to custom names
quantile_names = {i: f'A{i+130}' for i in range(df2b['age_quantile'].nunique())}
df2b['age'] = df2b['age_quantile'].map(quantile_names)

df2b.drop(columns='age_quantile', inplace=True)

In [21]:
# Define mapping dictionary
mapping_dict = {0:'A110', 1: 'A111', 2: 'A112', 3: 'A113', 4: 'A114', 5: 'A115', 6: 'A116', 7: 'A117'}

# Apply mapping to the column
df2b['residence_since'] = df2b['residence_since'].map(mapping_dict)

In [22]:
# Define mapping dictionary
mapping_dict = {0:'A160', 1: 'A161', 2: 'A162', 3: 'A163', 4: 'A164', 5: 'A165', 6: 'A166', 7: 'A167'}

# Apply mapping to the column
df2b['nr_of_existing_credits'] = df2b['nr_of_existing_credits'].map(mapping_dict)

In [23]:
# Define mapping dictionary
mapping_dict = {0:'A180', 1: 'A181', 2: 'A182', 3: 'A183', 4: 'A184', 5: 'A185', 6: 'A186', 7: 'A187'}

# Apply mapping to the column
df2b['number_of_dependents'] = df2b['number_of_dependents'].map(mapping_dict)

In [24]:
df2b.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,< 0,A20,critical account/other credits existing (not a...,radio/television,A51,unknown/no savings account,>=7 years,A84,male;single,bank,A114,real estate,A139,none,own,A162,skilled employee;official,A181,Yes,Yes,1
1,0-200 DM,A27,existing credits paid back duly till now,radio/television,A58,< 100 DM,1-4 years,A82,female;divorced/separated/married,bank,A112,real estate,A130,none,own,A161,skilled employee;official,A181,No,Yes,2
2,no checking account,A21,critical account/other credits existing (not a...,education,A54,< 100 DM,4-7 years,A82,male;single,bank,A113,real estate,A138,none,own,A161,unskilled;resident,A182,No,Yes,1
3,< 0,A27,existing credits paid back duly till now,furniture/equipment,A59,< 100 DM,4-7 years,A82,male;single,none,A114,building society savings agreement/life insurance,A137,none,for free,A161,skilled employee;official,A182,No,Yes,1
4,< 0,A24,delay in paying off in the past,car(new),A58,< 100 DM,1-4 years,A83,male;single,bank,A114,unknown / no property,A139,none,for free,A162,skilled employee;official,A182,No,Yes,2


### Data Contains values of Encodings (e.g. A11) and numerical values are categorized into groups based on quantiles [DF3]

In [25]:
df3 = df1.copy()

In [26]:
df3.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2


In [27]:
df3['credit_amount_quantile'] = pd.qcut(df3['credit_amount'], q=10, labels=False, duplicates='drop')

# Mapping quantile groups to custom names
quantile_names = {i: f'A{i+50}' for i in range(df3['credit_amount_quantile'].nunique())}
df3['credit_amount_quantile'] = df3['credit_amount_quantile'].map(quantile_names)

In [28]:
# Define mapping dictionary
mapping_dict = {0:'A80', 1: 'A81', 2: 'A82', 3: 'A83', 4: 'A84', 5: 'A85', 6: 'A86', 7: 'A87'}

# Apply mapping to the column
df3['installment_rate_coded'] = df3['intallment_rate'].map(mapping_dict)

In [29]:
# Grouping the 'duration' column into 10% quantiles
df3['duration_quantile'] = pd.qcut(df3['duration'], q=10, labels=False, duplicates='drop')

# Mapping quantile groups to custom names
quantile_names = {i: f'A{i+20}' for i in range(df3['duration_quantile'].nunique())}
df3['duration_quantile'] = df3['duration_quantile'].map(quantile_names)

In [30]:
df3['age_quantile'] = pd.qcut(df3['age'], q=10, labels=False, duplicates='drop')

# Mapping quantile groups to custom names
quantile_names = {i: f'A{i+130}' for i in range(df3['age_quantile'].nunique())}
df3['age_quantile'] = df3['age_quantile'].map(quantile_names)

In [31]:
# Define mapping dictionary
mapping_dict = {0:'A110', 1: 'A111', 2: 'A112', 3: 'A113', 4: 'A114', 5: 'A115', 6: 'A116', 7: 'A117'}

# Apply mapping to the column
df3['residence_since_coded'] = df3['residence_since'].map(mapping_dict)

In [32]:
# Define mapping dictionary
mapping_dict = {0:'A160', 1: 'A161', 2: 'A162', 3: 'A163', 4: 'A164', 5: 'A165', 6: 'A166', 7: 'A167'}

# Apply mapping to the column
df3['nr_of_existing_credits_coded'] = df3['nr_of_existing_credits'].map(mapping_dict)

In [33]:
# Define mapping dictionary
mapping_dict = {0:'A180', 1: 'A181', 2: 'A182', 3: 'A183', 4: 'A184', 5: 'A185', 6: 'A186', 7: 'A187'}

# Apply mapping to the column
df3['number_of_dependents_coded'] = df3['number_of_dependents'].map(mapping_dict)

In [34]:
df3.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target,credit_amount_quantile,installment_rate_coded,duration_quantile,age_quantile,residence_since_coded,nr_of_existing_credits_coded,number_of_dependents_coded
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1,A51,A84,A20,A139,A114,A162,A181
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2,A58,A82,A27,A130,A112,A161,A181
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1,A54,A82,A21,A138,A113,A161,A182
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1,A59,A82,A27,A137,A114,A161,A182
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2,A58,A83,A24,A139,A114,A162,A182


### Data Contains values of Encodings (e.g. A11) and numerical values are categorized into groups based on quantiles [DF4] (the numerical columns are removed)

In [35]:
df4 = df1.copy()

In [36]:
df4['credit_amount_quantile'] = pd.qcut(df4['credit_amount'], q=10, labels=False, duplicates='drop')

# Mapping quantile groups to custom names
quantile_names = {i: f'A{i+50}' for i in range(df4['credit_amount_quantile'].nunique())}
df4['credit_amount'] = df4['credit_amount_quantile'].map(quantile_names)

df4.drop(columns='credit_amount_quantile', inplace=True)

In [37]:
# Define mapping dictionary
mapping_dict = {0:'A80', 1: 'A81', 2: 'A82', 3: 'A83', 4: 'A84', 5: 'A85', 6: 'A86', 7: 'A87'}

# Apply mapping to the column
df4['intallment_rate'] = df4['intallment_rate'].map(mapping_dict)

In [38]:
# Grouping the 'duration' column into 10% quantiles
df4['duration_quantile'] = pd.qcut(df4['duration'], q=10, labels=False, duplicates='drop')

# Mapping quantile groups to custom names
quantile_names = {i: f'A{i+20}' for i in range(df4['duration_quantile'].nunique())}
df4['duration'] = df4['duration_quantile'].map(quantile_names)

df4.drop(columns='duration_quantile', inplace=True)

In [39]:
df4['age_quantile'] = pd.qcut(df4['age'], q=10, labels=False, duplicates='drop')

# Mapping quantile groups to custom names
quantile_names = {i: f'A{i+130}' for i in range(df4['age_quantile'].nunique())}
df4['age'] = df4['age_quantile'].map(quantile_names)

df4.drop(columns='age_quantile', inplace=True)

In [40]:
# Define mapping dictionary
mapping_dict = {0:'A110', 1: 'A111', 2: 'A112', 3: 'A113', 4: 'A114', 5: 'A115', 6: 'A116', 7: 'A117'}

# Apply mapping to the column
df4['residence_since'] = df4['residence_since'].map(mapping_dict)

In [41]:
# Define mapping dictionary
mapping_dict = {0:'A160', 1: 'A161', 2: 'A162', 3: 'A163', 4: 'A164', 5: 'A165', 6: 'A166', 7: 'A167'}

# Apply mapping to the column
df4['nr_of_existing_credits'] = df4['nr_of_existing_credits'].map(mapping_dict)

In [42]:
# Define mapping dictionary
mapping_dict = {0:'A180', 1: 'A181', 2: 'A182', 3: 'A183', 4: 'A184', 5: 'A185', 6: 'A186', 7: 'A187'}

# Apply mapping to the column
df4['number_of_dependents'] = df4['number_of_dependents'].map(mapping_dict)

In [43]:
df4.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,A11,A20,A34,A43,A51,A65,A75,A84,A93,A101,A114,A121,A139,A143,A152,A162,A173,A181,A192,A201,1
1,A12,A27,A32,A43,A58,A61,A73,A82,A92,A101,A112,A121,A130,A143,A152,A161,A173,A181,A191,A201,2
2,A14,A21,A34,A46,A54,A61,A74,A82,A93,A101,A113,A121,A138,A143,A152,A161,A172,A182,A191,A201,1
3,A11,A27,A32,A42,A59,A61,A74,A82,A93,A103,A114,A122,A137,A143,A153,A161,A173,A182,A191,A201,1
4,A11,A24,A33,A40,A58,A61,A73,A83,A93,A101,A114,A124,A139,A143,A153,A162,A173,A182,A191,A201,2


In [97]:
# Assuming df is your DataFrame
df4.to_csv('df4.csv')

### Data Contains the dataset converted with Label Encoder [DF5] (contiued on the work of DF4)

In [44]:
df5 = df4.copy()

In [45]:
#label encoding

from sklearn.preprocessing import LabelEncoder

# Assuming 'df' is your DataFrame
# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Iterate through each column in the DataFrame
for column in df5.columns:
    # Check if the column is categorical
    if df5[column].dtype == 'object':
        # Use LabelEncoder to encode the categorical column
        df5[column] = label_encoder.fit_transform(df5[column])

In [46]:
df5.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,0,0,4,4,1,4,4,3,2,0,3,0,9,2,1,1,2,0,1,0,1
1,1,7,2,4,8,0,2,1,1,0,1,0,0,2,1,0,2,0,0,0,2
2,3,1,4,7,4,0,3,1,2,0,2,0,8,2,1,0,1,1,0,0,1
3,0,7,2,3,9,0,3,1,2,2,3,1,7,2,2,0,2,1,0,0,1
4,0,4,3,0,8,0,2,2,2,0,3,3,9,2,2,1,2,1,0,0,2


In [47]:
# Assuming df is your DataFrame
df5.to_csv('df5.csv')

##### Data Contains the dataset converted with Label Encoder [DF5b] (continued on the work of DF3 - encoded and numerical values)

In [48]:
df5b = df3.copy()

In [49]:
#label encoding

from sklearn.preprocessing import LabelEncoder

# Assuming 'df' is your DataFrame
# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Iterate through each column in the DataFrame
for column in df5b.columns:
    # Check if the column is categorical
    if df5b[column].dtype == 'object':
        # Use LabelEncoder to encode the categorical column
        df5b[column] = label_encoder.fit_transform(df5b[column])

In [50]:
df5b.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target,credit_amount_quantile,installment_rate_coded,duration_quantile,age_quantile,residence_since_coded,nr_of_existing_credits_coded,number_of_dependents_coded
0,0,6,4,4,1169,4,4,4,2,0,4,0,67,2,1,2,2,1,1,0,1,1,3,0,9,3,1,0
1,1,48,2,4,5951,0,2,2,1,0,2,0,22,2,1,1,2,1,0,0,2,8,1,7,0,1,0,0
2,3,12,4,7,2096,0,3,2,2,0,3,0,49,2,1,1,1,2,0,0,1,4,1,1,8,2,0,1
3,0,42,2,3,7882,0,3,2,2,2,4,1,45,2,2,1,2,2,0,0,1,9,1,7,7,3,0,1
4,0,24,3,0,4870,0,2,3,2,0,4,3,53,2,2,2,2,2,0,0,2,8,2,4,9,3,1,1


##### Data Contains the dataset converted with Label Encoder [DF5c] (continued on the work of DF3 - labeled and numerical values)

In [51]:
df5c = df5b.copy()

In [52]:
# Drop columns containing 'coded' in their names
columns_to_drop = [col for col in df5c.columns if 'coded' in col or 'quantile' in col]
df5c.drop(columns=columns_to_drop, inplace=True)

df5c.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,0,6,4,4,1169,4,4,4,2,0,4,0,67,2,1,2,2,1,1,0,1
1,1,48,2,4,5951,0,2,2,1,0,2,0,22,2,1,1,2,1,0,0,2
2,3,12,4,7,2096,0,3,2,2,0,3,0,49,2,1,1,1,2,0,0,1
3,0,42,2,3,7882,0,3,2,2,2,4,1,45,2,2,1,2,2,0,0,1
4,0,24,3,0,4870,0,2,3,2,0,4,3,53,2,2,2,2,2,0,0,2


In [53]:
# Assuming df is your DataFrame
df5c.to_csv('df5c.csv')

### Data Contains the dataset converted with Label Encoder and modify it to be used by fasttext[DF7] (continued on the work of DF4)

In [54]:
df7 = df4.copy()

In [55]:
df7.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,A11,A20,A34,A43,A51,A65,A75,A84,A93,A101,A114,A121,A139,A143,A152,A162,A173,A181,A192,A201,1
1,A12,A27,A32,A43,A58,A61,A73,A82,A92,A101,A112,A121,A130,A143,A152,A161,A173,A181,A191,A201,2
2,A14,A21,A34,A46,A54,A61,A74,A82,A93,A101,A113,A121,A138,A143,A152,A161,A172,A182,A191,A201,1
3,A11,A27,A32,A42,A59,A61,A74,A82,A93,A103,A114,A122,A137,A143,A153,A161,A173,A182,A191,A201,1
4,A11,A24,A33,A40,A58,A61,A73,A83,A93,A101,A114,A124,A139,A143,A153,A162,A173,A182,A191,A201,2


In [56]:
# Assuming your target column is named "target"
df7['target'] = '__label__' + df7['target'].astype(str)

In [57]:
df7.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,A11,A20,A34,A43,A51,A65,A75,A84,A93,A101,A114,A121,A139,A143,A152,A162,A173,A181,A192,A201,__label__1
1,A12,A27,A32,A43,A58,A61,A73,A82,A92,A101,A112,A121,A130,A143,A152,A161,A173,A181,A191,A201,__label__2
2,A14,A21,A34,A46,A54,A61,A74,A82,A93,A101,A113,A121,A138,A143,A152,A161,A172,A182,A191,A201,__label__1
3,A11,A27,A32,A42,A59,A61,A74,A82,A93,A103,A114,A122,A137,A143,A153,A161,A173,A182,A191,A201,__label__1
4,A11,A24,A33,A40,A58,A61,A73,A83,A93,A101,A114,A124,A139,A143,A153,A162,A173,A182,A191,A201,__label__2


In [58]:
# Select columns from the second column till the last column
selected_columns = df7.iloc[:, :-1]

In [59]:
selected_columns.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign
0,A11,A20,A34,A43,A51,A65,A75,A84,A93,A101,A114,A121,A139,A143,A152,A162,A173,A181,A192,A201
1,A12,A27,A32,A43,A58,A61,A73,A82,A92,A101,A112,A121,A130,A143,A152,A161,A173,A181,A191,A201
2,A14,A21,A34,A46,A54,A61,A74,A82,A93,A101,A113,A121,A138,A143,A152,A161,A172,A182,A191,A201
3,A11,A27,A32,A42,A59,A61,A74,A82,A93,A103,A114,A122,A137,A143,A153,A161,A173,A182,A191,A201
4,A11,A24,A33,A40,A58,A61,A73,A83,A93,A101,A114,A124,A139,A143,A153,A162,A173,A182,A191,A201


In [60]:
# Create a new column "content" by concatenating values from selected columns
df7['content'] = df7['target'] + ' ' + selected_columns.apply(lambda row: ' '.join(str(val) for val in row), axis=1)

In [61]:
df7.head()

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target,content
0,A11,A20,A34,A43,A51,A65,A75,A84,A93,A101,A114,A121,A139,A143,A152,A162,A173,A181,A192,A201,__label__1,__label__1 A11 A20 A34 A43 A51 A65 A75 A84 A93...
1,A12,A27,A32,A43,A58,A61,A73,A82,A92,A101,A112,A121,A130,A143,A152,A161,A173,A181,A191,A201,__label__2,__label__2 A12 A27 A32 A43 A58 A61 A73 A82 A92...
2,A14,A21,A34,A46,A54,A61,A74,A82,A93,A101,A113,A121,A138,A143,A152,A161,A172,A182,A191,A201,__label__1,__label__1 A14 A21 A34 A46 A54 A61 A74 A82 A93...
3,A11,A27,A32,A42,A59,A61,A74,A82,A93,A103,A114,A122,A137,A143,A153,A161,A173,A182,A191,A201,__label__1,__label__1 A11 A27 A32 A42 A59 A61 A74 A82 A93...
4,A11,A24,A33,A40,A58,A61,A73,A83,A93,A101,A114,A124,A139,A143,A153,A162,A173,A182,A191,A201,__label__2,__label__2 A11 A24 A33 A40 A58 A61 A73 A83 A93...


In [62]:
# Assuming df is your DataFrame
df7.to_csv('df7.csv')

### Data Contains the dataset converted with Label Encoder and modify it to be used by fasttext[DF7a] (continued on the work of DF2)

In [108]:
df7a = df2.copy()

In [109]:
df7a.head(2)

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,< 0,6,critical account/other credits existing (not a...,radio/television,1169,unknown/no savings account,>=7 years,4,male;single,bank,4,real estate,67,none,own,2,skilled employee;official,1,Yes,Yes,1
1,0-200 DM,48,existing credits paid back duly till now,radio/television,5951,< 100 DM,1-4 years,2,female;divorced/separated/married,bank,2,real estate,22,none,own,1,skilled employee;official,1,No,Yes,2


In [110]:
# Assuming your target column is named "target"
df7a['target'] = '__label__' + df7a['target'].astype(str)

In [111]:
df7a.head(2)

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,< 0,6,critical account/other credits existing (not a...,radio/television,1169,unknown/no savings account,>=7 years,4,male;single,bank,4,real estate,67,none,own,2,skilled employee;official,1,Yes,Yes,__label__1
1,0-200 DM,48,existing credits paid back duly till now,radio/television,5951,< 100 DM,1-4 years,2,female;divorced/separated/married,bank,2,real estate,22,none,own,1,skilled employee;official,1,No,Yes,__label__2


In [112]:
# Select columns from the second column till the last column
selected_columns = df7a.iloc[:, :-1]

In [113]:
selected_columns.head(2)

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign
0,< 0,6,critical account/other credits existing (not a...,radio/television,1169,unknown/no savings account,>=7 years,4,male;single,bank,4,real estate,67,none,own,2,skilled employee;official,1,Yes,Yes
1,0-200 DM,48,existing credits paid back duly till now,radio/television,5951,< 100 DM,1-4 years,2,female;divorced/separated/married,bank,2,real estate,22,none,own,1,skilled employee;official,1,No,Yes


In [114]:
# Create a new column "content" by concatenating values from selected columns
df7a['content'] = df7a['target'] + ' ' + selected_columns.apply(lambda row: ' '.join(str(val) for val in row), axis=1)

In [116]:
df7a.head(2)

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target,content
0,< 0,6,critical account/other credits existing (not a...,radio/television,1169,unknown/no savings account,>=7 years,4,male;single,bank,4,real estate,67,none,own,2,skilled employee;official,1,Yes,Yes,__label__1,__label__1 < 0 6 critical account/other credit...
1,0-200 DM,48,existing credits paid back duly till now,radio/television,5951,< 100 DM,1-4 years,2,female;divorced/separated/married,bank,2,real estate,22,none,own,1,skilled employee;official,1,No,Yes,__label__2,__label__2 0-200 DM 48 existing credits paid b...


In [117]:
# Assuming df is your DataFrame
df7a.to_csv('df7a.csv')

### Data Contains the dataset converted to One Hot [DF6] (contiued on the work of DF4)

In [63]:
df6 = df4.copy()

In [64]:
# Separate numerical and categorical columns
numerical_cols = df6.select_dtypes(include=['int', 'float']).columns
categorical_cols = df6.select_dtypes(include=['object']).columns

# Perform one-hot encoding on categorical columns
one_hot_encoded_cols = pd.get_dummies(df6[categorical_cols])

# Concatenate the one-hot encoded columns with the original DataFrame
df6_hot = pd.concat([one_hot_encoded_cols, df5[numerical_cols]], axis=1)

In [65]:
df6_hot.head()

Unnamed: 0,account_balance_A11,account_balance_A12,account_balance_A13,account_balance_A14,duration_A20,duration_A21,duration_A22,duration_A23,duration_A24,duration_A25,duration_A26,duration_A27,credit_history_A30,credit_history_A31,credit_history_A32,credit_history_A33,credit_history_A34,purpose_A40,purpose_A41,purpose_A410,purpose_A42,purpose_A43,purpose_A44,purpose_A45,purpose_A46,purpose_A48,purpose_A49,credit_amount_A50,credit_amount_A51,credit_amount_A52,credit_amount_A53,credit_amount_A54,credit_amount_A55,credit_amount_A56,credit_amount_A57,credit_amount_A58,credit_amount_A59,savings_bond_value_A61,savings_bond_value_A62,savings_bond_value_A63,savings_bond_value_A64,savings_bond_value_A65,employed_since_A71,employed_since_A72,employed_since_A73,employed_since_A74,employed_since_A75,intallment_rate_A81,intallment_rate_A82,intallment_rate_A83,intallment_rate_A84,sex_marital_A91,sex_marital_A92,sex_marital_A93,sex_marital_A94,guarantor_A101,guarantor_A102,guarantor_A103,residence_since_A111,residence_since_A112,residence_since_A113,residence_since_A114,property_A121,property_A122,property_A123,property_A124,age_A130,age_A131,age_A132,age_A133,age_A134,age_A135,age_A136,age_A137,age_A138,age_A139,other_installment_plans_A141,other_installment_plans_A142,other_installment_plans_A143,type_of_housing_A151,type_of_housing_A152,type_of_housing_A153,nr_of_existing_credits_A161,nr_of_existing_credits_A162,nr_of_existing_credits_A163,nr_of_existing_credits_A164,job_A171,job_A172,job_A173,job_A174,number_of_dependents_A181,number_of_dependents_A182,telephone_A191,telephone_A192,foreign_A201,foreign_A202,target
0,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,True,False,False,False,False,True,False,True,False,False,True,True,False,1
1,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,True,False,2
2,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,True,False,True,False,False,False,False,True,False,False,False,True,True,False,True,False,1
3,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,True,False,False,False,False,False,True,False,False,True,True,False,True,False,1
4,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,True,False,False,False,False,True,False,False,True,True,False,True,False,2


### DF7b ([DF7 dataset values replaced with vectors through word2vec])

In [66]:
df7b = df7.copy()

In [67]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
import numpy as np
np.random.seed(42)

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [69]:
features = df7['content'].apply(lambda x: x.split(' ', 1)[1])  # Drop the target column to get the feature columns
target = df7['target'].apply(lambda x: int(x.split("__label__")[1]))  # Target column to predict

In [70]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

features = features.apply(preprocess)

In [71]:
sentences = [sentence.split() for sentence in features]

In [72]:
w2v_model = Word2Vec(sentences, vector_size=20, window=21, min_count=2, workers=4, seed=42) #size=100,

In [73]:
import numpy as np

In [74]:
# Assuming w2v_model is your Word2Vec model object
vocabulary = list(w2v_model.wv.key_to_index.keys())

# Print the vocabulary
print("Vocabulary size:", len(vocabulary))
print("Example words in the vocabulary:")
for word in vocabulary[:2]:
    print(word)


Vocabulary size: 96
Example words in the vocabulary:
a201
a101


In [75]:
df7b = df4.copy()

In [76]:
df7b = df7b.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [77]:
df7b.head(2)

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,a11,a20,a34,a43,a51,a65,a75,a84,a93,a101,a114,a121,a139,a143,a152,a162,a173,a181,a192,a201,1
1,a12,a27,a32,a43,a58,a61,a73,a82,a92,a101,a112,a121,a130,a143,a152,a161,a173,a181,a191,a201,2


In [78]:
# Vectorize the values in the column using Word2Vec
#df7b['account_balance'] = df7b['account_balance'].apply(lambda words: np.mean([w2v_model.wv[word] for word in words.split() if word in w2v_model.wv], axis=0))

In [79]:
# Iterate over each column in df7b
for col in df7b.columns:
    # Check if the column contains strings
    if df7b[col].dtype == 'object':
        # Lowercase the words in the column
        df7b[col] = df7b[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
        # Vectorize the values in the column using Word2Vec
        df7b[col] = df7b[col].apply(lambda words: np.mean([w2v_model.wv[word] for word in words.split() if word in w2v_model.wv], axis=0) if isinstance(words, str) else np.nan)

In [80]:
df7b.head(2)

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,"[0.5123656, 0.08721851, -0.04502545, 0.5276825...","[0.40796077, 0.13289225, -0.010721123, 0.41913...","[0.5123638, 0.14454436, -0.08207153, 0.5581904...","[0.47953433, 0.08613269, -0.09942285, 0.475111...","[0.4542883, 0.09474913, 0.00062427256, 0.48682...","[0.4543795, 0.15749195, -0.02969824, 0.4704520...","[0.40379667, 0.13925527, -0.013040015, 0.53484...","[0.4173511, 0.122742996, -0.03808109, 0.460046...","[0.5095015, 0.07652485, -0.06183192, 0.5244075...","[0.45179877, 0.09723043, -0.09049437, 0.548357...","[0.4108535, 0.14049871, -0.10504845, 0.526732,...","[0.5103311, 0.1447146, -0.030989435, 0.5001373...","[0.3103693, 0.07290498, -0.041894984, 0.442568...","[0.48364577, 0.14798652, -0.077428155, 0.53864...","[0.53804064, 0.1050709, -0.0975563, 0.55001956...","[0.40061048, 0.06251639, -0.023368098, 0.50267...","[0.39778998, 0.13783263, -0.043663766, 0.44411...","[0.40816295, 0.14921758, -0.039182045, 0.46046...","[0.5104352, 0.085671745, -0.09236418, 0.569349...","[0.39140326, 0.15424466, -0.04632555, 0.466174...",1
1,"[0.5170593, 0.1741925, -0.09689912, 0.610715, ...","[0.47122538, 0.1741397, -0.039951626, 0.475930...","[0.47755164, 0.17640159, -0.067032106, 0.55229...","[0.47953433, 0.08613269, -0.09942285, 0.475111...","[0.3908935, 0.10919785, 0.0057443664, 0.426221...","[0.49954376, 0.09603415, -0.098292194, 0.47258...","[0.44272077, 0.15919638, -0.11206351, 0.537121...","[0.4853572, 0.08954783, -0.05937726, 0.4888126...","[0.51028377, 0.16105807, -0.09500858, 0.541432...","[0.45179877, 0.09723043, -0.09049437, 0.548357...","[0.39959398, 0.1741079, -0.007968137, 0.453570...","[0.5103311, 0.1447146, -0.030989435, 0.5001373...","[0.35772812, 0.12608181, -0.060044985, 0.35481...","[0.48364577, 0.14798652, -0.077428155, 0.53864...","[0.53804064, 0.1050709, -0.0975563, 0.55001956...","[0.46808314, 0.07567606, -0.020956784, 0.41933...","[0.39778998, 0.13783263, -0.043663766, 0.44411...","[0.40816295, 0.14921758, -0.039182045, 0.46046...","[0.5388965, 0.1310428, -0.07577989, 0.503798, ...","[0.39140326, 0.15424466, -0.04632555, 0.466174...",2


In [81]:
# Assuming df is your DataFrame
df7b.to_csv('df7b.csv')

### DF7C ([DF7 dataset values replaced with vectors through FastText])

In [82]:
from gensim.models import FastText

In [83]:
features = df7['content'].apply(lambda x: x.split(' ', 1)[1])  # Drop the target column to get the feature columns
target = df7['target'].apply(lambda x: int(x.split("__label__")[1]))  # Target column to predict

In [84]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

features = features.apply(preprocess)

In [85]:
sentences = [sentence.split() for sentence in features]

In [86]:
# Train the FastText model
fasttext_model = FastText(sentences, vector_size=20, window=21, min_count=2, workers=4, seed=42)

In [87]:
df7c = df4.copy()

In [88]:
df7c = df7c.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [89]:
df7c.head(2)

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,a11,a20,a34,a43,a51,a65,a75,a84,a93,a101,a114,a121,a139,a143,a152,a162,a173,a181,a192,a201,1
1,a12,a27,a32,a43,a58,a61,a73,a82,a92,a101,a112,a121,a130,a143,a152,a161,a173,a181,a191,a201,2


In [90]:
# Iterate over each column in df7b
for col in df7c.columns:
    # Check if the column contains strings
    if df7c[col].dtype == 'object':
        # Lowercase the words in the column
        df7c[col] = df7c[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
        # Vectorize the values in the column using FastText
        df7c[col] = df7c[col].apply(lambda words: np.mean([fasttext_model.wv[word] for word in words.split() if word in fasttext_model.wv], axis=0) if isinstance(words, str) else np.nan)

In [91]:
#fasttext_model.wv['a201']

In [92]:
df7c.head(2)

Unnamed: 0,account_balance,duration,credit_history,purpose,credit_amount,savings_bond_value,employed_since,intallment_rate,sex_marital,guarantor,residence_since,property,age,other_installment_plans,type_of_housing,nr_of_existing_credits,job,number_of_dependents,telephone,foreign,target
0,"[0.25201422, -0.58022404, 0.023599776, 1.08927...","[0.215892, -0.4885948, 0.015643708, 0.95023763...","[0.21190116, -0.4833613, 0.006333941, 0.931256...","[0.21696539, -0.48182616, 0.0077609913, 0.8861...","[0.18598916, -0.42411903, 0.019267196, 0.79975...","[0.19958936, -0.43781897, 0.031985257, 0.85318...","[0.22179072, -0.47679776, 0.04253314, 0.919203...","[0.1973424, -0.45717022, 0.021845307, 0.892923...","[0.2211176, -0.4650231, -0.010987164, 0.896041...","[0.22406635, -0.49421123, 0.013279202, 0.91488...","[0.19487606, -0.4619316, 0.019130161, 0.888033...","[0.20060587, -0.46536607, 0.0015388117, 0.8438...","[0.15387219, -0.4191745, 0.0045993496, 0.74472...","[0.19974351, -0.43976405, 0.028822983, 0.83793...","[0.22761442, -0.4582686, 0.009384564, 0.847235...","[0.20649101, -0.46065763, 0.029982258, 0.85346...","[0.18554501, -0.45078403, 0.025518404, 0.82516...","[0.21775311, -0.442762, 0.015221607, 0.8413076...","[0.2143011, -0.45749387, 0.0028023259, 0.85972...","[0.19656268, -0.46506938, 0.01377346, 0.888012...",1
1,"[0.24850321, -0.6013364, 0.01988187, 1.147242,...","[0.20415176, -0.4344493, 0.023065275, 0.864385...","[0.232789, -0.51465833, 0.013344884, 0.972002,...","[0.21696539, -0.48182616, 0.0077609913, 0.8861...","[0.17740902, -0.39528602, 0.011153662, 0.78091...","[0.23077331, -0.4974076, 0.028308408, 0.925245...","[0.21657798, -0.46179995, 0.017330352, 0.91657...","[0.2110451, -0.490981, 0.018941091, 0.9297582,...","[0.24637412, -0.45646873, 0.004152208, 0.90138...","[0.22406635, -0.49421123, 0.013279202, 0.91488...","[0.18072987, -0.43156606, 0.020819811, 0.83724...","[0.20060587, -0.46536607, 0.0015388117, 0.8438...","[0.1726764, -0.40127572, -0.0022063986, 0.7224...","[0.19974351, -0.43976405, 0.028822983, 0.83793...","[0.22761442, -0.4582686, 0.009384564, 0.847235...","[0.19709826, -0.43856522, 0.02222509, 0.821076...","[0.18554501, -0.45078403, 0.025518404, 0.82516...","[0.21775311, -0.442762, 0.015221607, 0.8413076...","[0.21061559, -0.46064204, 0.01336069, 0.842103...","[0.19656268, -0.46506938, 0.01377346, 0.888012...",2


In [93]:
# Assuming df is your DataFrame
df7c.to_csv('df7c.csv')

# OLS Regression

### finding the importance of each attribute with target as dependent variable

In [94]:
# Assuming 'df' is your DataFrame with features and the target column
# Replace 'features' with the actual list of feature columns
features = df5c.drop(columns=['target'])  # Drop the target column to get the feature columns
target = df5c['target']  # Target column to predict

# Splitting the dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [95]:
import statsmodels.api as sm

In [96]:
# Add a constant term to the features (intercept)
X_train_with_const = sm.add_constant(X_train)
X_test_with_const = sm.add_constant(X_test)

# Fit the OLS regression model
model = sm.OLS(y_train, X_train_with_const)
result = model.fit()

In [1]:
result

NameError: name 'result' is not defined