In [None]:
!pip install pandas numpy matplotlib scikit-learn tensorflow keras
!pip install human-learn

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from itertools import product
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Input, BatchNormalization, Flatten, Reshape
from keras.callbacks import EarlyStopping
import time

In [2]:
# Load the Excel dataset into `data`.
# Alternative input: 'datasets/data_1.csv', which would be read with pd.read_csv(file_path).
file_path = 'datasets/data_2.xlsx'
data = pd.read_excel(file_path)

In [3]:
# Show the dtype pandas inferred for each loaded column.
data.dtypes

Unnamed: 0     int64
First Name    object
Last Name     object
Gender        object
Country       object
Age            int64
Date          object
Id             int64
dtype: object

In [4]:
# Display the loaded frame (the rendered output is truncated by pandas for long frames).
data

Unnamed: 0.1,Unnamed: 0,First Name,Last Name,Gender,Country,Age,Date,Id
0,1,Dulce,Abril,Female,United States,32,15/10/2017,1562
1,2,Mara,Hashimoto,Female,Great Britain,25,16/08/2016,1582
2,3,Philip,Gent,Male,France,36,21/05/2015,2587
3,4,Kathleen,Hanner,Female,United States,25,15/10/2017,3549
4,5,Nereida,Magwood,Female,United States,58,16/08/2016,2468
...,...,...,...,...,...,...,...,...
4995,4996,Roma,Lafollette,Female,United States,34,15/10/2017,2654
4996,4997,Felisa,Cail,Female,United States,28,16/08/2016,6525
4997,4998,Demetria,Abbey,Female,United States,32,21/05/2015,3265
4998,4999,Jeromy,Danz,Male,United States,39,15/10/2017,3265


In [5]:
# Work on a separate frame so `data` keeps the values exactly as loaded;
# the Country column is stored as strings in the working copy.
df = data.assign(Country=data['Country'].astype('str'))

In [6]:
class Hash:
    """Polynomial rolling hash of a string.

    The hash is ``sum((1 + ord(c) - ord('a')) * p**i) mod m`` over the characters
    ``c`` at position ``i``, with ``p = 31`` and ``m = 10**9 + 7``.

    Attributes:
        hash_value: the computed hash, an int in ``[0, m)``.
        p: base of the polynomial.
        m: modulus applied after every step.
        length: number of characters in the hashed string.

    Two ``Hash`` objects compare equal when their ``hash_value`` matches, so
    distinct strings that collide are considered equal.
    """

    def __init__(self, str_object: str) -> None:
        self.p = 31
        self.m = 10**9 + 7
        self.length = len(str_object)

        hash_so_far = 0
        p_pow = 1
        for char in str_object:
            # Characters are offset so that 'a' maps to 1; characters below 'a'
            # give a negative term, which Python's % folds back into [0, m).
            hash_so_far = (hash_so_far + (1 + ord(char) - ord('a')) * p_pow) % self.m
            p_pow = (p_pow * self.p) % self.m

        self.hash_value = hash_so_far

    def __eq__(self, other: object) -> bool:
        # Defer to the other operand for foreign types instead of raising
        # AttributeError on a missing `hash_value`.
        if not isinstance(other, Hash):
            return NotImplemented
        return self.hash_value == other.hash_value

    def __hash__(self) -> int:
        # Defining __eq__ alone sets __hash__ to None; restore hashability in a
        # way that is consistent with equality.
        return hash(self.hash_value)

In [7]:
class DataTransform:
    """Encode the object-typed columns of a DataFrame and its target column.

    On construction the instance:
      * one-hot encodes the target if it is categorical (``norm_target``),
      * resolves a type name for every column (``set_data_type``),
      * fills ``transformed_data`` with parsed date columns, label-encoded
        categorical object columns and normalised hashes of the remaining
        object columns (``transform_object_series``),
      * removes the target column from ``transformed_data``.

    NOTE(review): columns whose resolved type is not 'object' (and whose name
    does not contain 'date') are never copied into ``transformed_data`` --
    confirm that dropping numeric features is intended.

    Attributes:
        count_categories: a series with fewer unique values than this is
            treated as categorical.
        features: columns of the frame passed to the constructor.
        features_types: mapping of column name -> resolved type name.
        target: the one-hot encoded target array if the target is categorical,
            otherwise the target series as passed in.
        classes: mapping of one-hot column index -> original target label
            (empty when the target is not categorical).
        transformed_data: DataFrame holding the encoded feature columns.
    """

    def __init__(self, data_frame: pd.DataFrame, target_data: pd.Series, count_categories: int = 30):
        """Build all encodings for ``data_frame``.

        Args:
            data_frame: frame containing the features and the target column.
            target_data: the target series; its ``name`` must be a column of
                ``data_frame``.
            count_categories: cardinality threshold below which a series is
                considered categorical.
        """
        self.count_categories = count_categories
        self.one_hot_encoder = OneHotEncoder(sparse_output=False)
        self.label_encoder = LabelEncoder()
        # Read the columns from the frame that was passed in, not from a
        # notebook-level variable that may hold a different frame.
        self.features = data_frame.columns
        self.features_types = {}
        self.target = target_data
        self.classes = {}
        self.arguable_type = 'object'
        self.norm_target(self.target, data_frame)
        # Pre-set the index so array-valued and Series-valued columns line up
        # row by row regardless of which kind is assigned first.
        self.transformed_data = pd.DataFrame(index=data_frame.index)
        self.set_data_type(data_frame)
        self.transform_object_series(data_frame)
        # The target is only present if it was an object column; ignore it otherwise.
        self.transformed_data = self.transformed_data.drop(columns=[target_data.name], errors='ignore')

    def hashing_object_series(self, data_series: pd.Series) -> pd.Series:
        """Return a series with every value replaced by its ``Hash(...).hash_value``."""
        return data_series.apply(lambda value: Hash(value).hash_value)

    def norm_target(self, data_series: pd.Series, data: pd.DataFrame) -> None:
        """One-hot encode the target column when it is categorical.

        Sets ``self.target`` to the encoded array and ``self.classes`` to a
        mapping of one-hot column index -> label. Does nothing when
        ``data_series`` is not categorical.
        """
        if not self.check_categorical(data_series):
            return
        encoded_target = self.one_hot_encoder.fit_transform(data[[data_series.name]])
        # One (label, one-hot row) pair per distinct label, then invert it to
        # index -> label using the position of the hot bit.
        label_to_row = dict(zip(self.target, encoded_target))
        self.classes = {np.argmax(row): label for label, row in label_to_row.items()}
        self.target = encoded_target

    def set_data_type(self, data: pd.DataFrame) -> None:
        """Populate ``self.features_types`` with the resolved type of every feature."""
        self.features_types = {feature: self.check_type(data[feature]) for feature in self.features}

    def check_categorical(self, data_series: pd.Series) -> bool:
        """Return True when the series has fewer than ``count_categories`` unique values.

        The comparison is done on the counts directly, so an empty series no
        longer triggers a division by zero.
        """
        return bool(data_series.nunique() < self.count_categories)

    def check_type(self, data_series: pd.Series) -> str:
        """Resolve the type name of a series.

        Non-object series report their dtype name. For object series the Python
        type (str, int or float) that occurs most often among the elements
        decides between 'object', 'int64' and 'float64'; ties are resolved in
        that order.
        """
        if not (data_series.dtypes == self.arguable_type):
            return data_series.dtype.name

        str_count = int_count = float_count = 0
        for element in data_series:
            if isinstance(element, str):
                str_count += 1
            elif isinstance(element, int):
                int_count += 1
            elif isinstance(element, float):
                float_count += 1

        most_common = max(str_count, int_count, float_count)
        if most_common == str_count:
            return 'object'
        if most_common == int_count:
            return 'int64'
        return 'float64'

    def transform_object_series(self, data: pd.DataFrame) -> None:
        """Fill ``self.transformed_data`` with the encoded columns of ``data``.

        * Columns with 'date' in their name are parsed as ``%d/%m/%Y`` dates.
        * Categorical object columns are label-encoded.
        * Other object columns are hashed and scaled by the largest absolute hash.
        * All remaining columns are skipped.
        """
        for feature in self.features:
            # str() keeps non-string column labels from failing on .lower().
            if 'date' in str(feature).lower():
                self.transformed_data[feature] = pd.to_datetime(data[feature], format='%d/%m/%Y')
            elif self.features_types[feature] == self.arguable_type:
                if self.check_categorical(data[feature]):
                    # The single shared encoder is refit per column, so after the
                    # loop it holds the classes of the last categorical column only.
                    self.transformed_data[feature] = self.label_encoder.fit_transform(data[feature])
                else:
                    hashed = self.hashing_object_series(data[feature])
                    self.transformed_data[feature] = hashed / hashed.abs().max()


In [8]:
# Encode the working frame, using the Gender column as the prediction target.
df_transform = DataTransform(data_frame=df, target_data=df['Gender'])

print(df_transform.features_types)
# Mapping of one-hot column index -> original Gender label, shown as the cell output.
target_classes = df_transform.classes
target_classes

{'Unnamed: 0': 'int64', 'First Name': 'object', 'Last Name': 'object', 'Gender': 'object', 'Country': 'object', 'Age': 'int64', 'Date': 'object', 'Id': 'int64'}


{2: 'Female', 3: 'Male', 0: '34fd', 1: '3ew', 5: 'dfsve', 4: 'Nan'}

In [9]:
# Feature matrix: every encoded column except the parsed Date column.
X = df_transform.transformed_data.drop(columns=['Date'])
# Target: taken as produced by the transformer.
y = df_transform.target

In [10]:
# Input width = number of feature columns; output width = number of target columns.
num_features = len(X.columns)
num_classes = y.shape[1]
print(f'Num features: {num_features}')
print(f'Num classes: {num_classes}')

Num features: 3
Num classes: 6


In [11]:
# Print the feature frame and the target array for a visual sanity check.
print(f'X data:\n {X} \ny data:\n {y}')

X data:
       First Name  Last Name  Country
0       0.004751   0.011436        3
1       0.000047   0.312931        2
2       0.469922   0.000613        1
3       0.773831   0.523516        3
4       0.010570   1.000000        3
...          ...        ...      ...
4995    0.000043   0.981280        3
4996    0.046772   0.000368        3
4997    0.034336   0.023379        3
4998    0.733156   0.000793        3
4999    0.212083   0.161015        3

[5000 rows x 3 columns] 
y data:
 [[0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 ...
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [12]:
# Hold out 10% of the rows for testing. The seed is fixed so the same rows end
# up in each partition on every run (Restart & Run All gives the same split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [13]:
def create_model(activation='relu', optimizer='adam', n_features=None, n_classes=None):
    """Build and compile the LSTM + dense softmax classifier.

    Args:
        activation: activation used by every hidden Dense layer.
        optimizer: optimizer passed to ``model.compile``.
        n_features: width of one input row; defaults to the notebook-level
            ``num_features`` when omitted.
        n_classes: number of softmax outputs; defaults to the notebook-level
            ``num_classes`` when omitted.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss
        and reporting accuracy.
    """
    if n_features is None:
        n_features = num_features
    if n_classes is None:
        n_classes = num_classes

    model = Sequential([
        # Declare the input with an explicit Input layer instead of passing
        # `input_shape=` to the first layer.
        Input(shape=(n_features,)),
        # Each row is presented to the LSTM as a sequence with a single timestep.
        Reshape((1, n_features)),
        LSTM(512, return_sequences=False),
        Flatten(),
        Dense(256, activation=activation),
        BatchNormalization(),
        Dropout(0.2),
        Dense(128, activation=activation),
        Dropout(0.2),
        Dense(64, activation=activation),
        Dropout(0.2),
        Dense(32, activation=activation),
        Dropout(0.2),
        Dense(16, activation=activation),
        BatchNormalization(),
        Dropout(0.2),
        Dense(n_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

# Instantiate the network with ReLU hidden activations and the Adam optimizer,
# then print its layer-by-layer summary.
model = create_model(activation='relu', optimizer='adam')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 1, 3)              0         
                                                                 
 lstm (LSTM)                 (None, 512)               1056768   
                                                                 
 flatten (Flatten)           (None, 512)               0         
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 batch_normalization (BatchN  (None, 256)              1024      
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                        

In [14]:
# Early stopping watches the training loss (no validation data is passed to fit)
# with a patience of 3 epochs.
callback = EarlyStopping(monitor='loss', patience=3)

# Wall-clock timestamps taken immediately before and after fit(); the elapsed
# time (end_time - start_time) is not printed in this cell.
start_time = time.time()

history = model.fit(X_train, y_train, epochs=200, batch_size=128, callbacks=[callback])

end_time = time.time()

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200


In [15]:
# Evaluate on the held-out rows; score[0] is printed as the loss and score[1]
# as the accuracy (the model was compiled with metrics=['accuracy']).
score = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {score[0]}')
print(f'Test accuracy: {score[1]*100} %')

Test loss: 0.2815310060977936
Test accuracy: 90.39999842643738 %


In [16]:
# Predict class probabilities for the full feature matrix X, i.e. for the rows
# that went into X_train as well as those in X_test.
y_predict = model.predict(X)



In [18]:
# Compare predicted and true class indices over the whole dataset.
# argmax is taken once per array instead of repeatedly inside the loop.
true_classes = np.argmax(y, axis=1)
predict_1 = list(np.argmax(y_predict, axis=1))

# Set of class indices the model ever predicts; set membership keeps the loop
# linear instead of scanning the full prediction list for every row.
predicted_classes = set(predict_1)

for true_cls, pred_cls in zip(true_classes, predict_1):
    # Report rows whose true class is never produced by the model for any row.
    if true_cls not in predicted_classes:
        print(f'Error: {target_classes[true_cls]} can be {target_classes[pred_cls]}')

# Fraction of rows where the predicted class index equals the true one.
correct = int(np.sum(true_classes == np.asarray(predict_1)))
print(f'Accuracy: {correct/len(y_predict)}')


Error: 34fd can be Female
Error: 3ew can be Female
Error: dfsve can be Female
Error: Nan can be Female
Accuracy: 0.8994
