In [1]:
import json
import collections
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Embedding, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
# Install a global "ignore" filter for every warning category; presumably
# this is meant to keep library warnings out of the cell outputs below.
warnings.filterwarnings('ignore')

# 1. Income classifier
This section uses a dataset of people's personal information to predict their likely income, i.e. whether or not they earn more than 50K annually.

In [3]:
# Fetch the adult-income CSV; skipinitialspace drops the blank that follows
# each delimiter in the raw file.
df = pd.read_csv(
    'https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv',
    skipinitialspace=True,
)
# Every column other than the 'income' target is treated as a feature.
x_cols = list(filter(lambda column: column != 'income', df.columns))
X, y = df[x_cols], df['income']
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [9]:
# First row of DataFrame.mode(), i.e. one most-frequent value per training
# column, turned into a {column: value} mapping.
train_mode = dict(X_train.mode().iloc[0])
# Fill missing training values column by column with those modes.
X_train = X_train.fillna(train_mode)
print (train_mode)

{'age': 31.0, 'workclass': 'Private', 'fnlwgt': 121124, 'education': 'HS-grad', 'education-num': 9.0, 'marital-status': 'Married-civ-spouse', 'occupation': 'Prof-specialty', 'relationship': 'Husband', 'race': 'White', 'sex': 'Male', 'capital-gain': 0.0, 'capital-loss': 0.0, 'hours-per-week': 40.0, 'native-country': 'United-States'}


In [10]:
# Integer-encode each categorical column in place and keep the fitted
# LabelEncoder for that column, keyed by column name.
categorical_columns = (
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
)
encoder = {}
for column in categorical_columns:
    fitted = LabelEncoder()
    X_train[column] = fitted.fit_transform(X_train[column])
    encoder[column] = fitted

In [11]:
# Fit two 100-tree ensembles on the encoded training data. Both estimators are
# randomized, so a fixed random_state (the same seed the train/test split
# uses) is passed to make the fitted models reproducible from run to run.
rf = RandomForestClassifier(n_estimators=100, random_state=1234).fit(X_train, y_train)
et = ExtraTreesClassifier(n_estimators=100, random_state=1234).fit(X_train, y_train)

In [13]:
# Persist the fill values, the per-column label encoders and both fitted
# models as compressed joblib files in the working directory.
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoder, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']

# 2. Premium Insurance
This section uses a dataset of people's medical records to predict the likeliest insurance premium price (in Indian rupees) they will have to pay.

In [3]:
# Read the medical-premium CSV from the working directory and preview it.
data = pd.read_csv('Medicalpremium.csv')
data.head()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice
0,45,0,0,0,0,155,57,0,0,0,25000
1,60,1,0,0,0,180,73,0,0,0,29000
2,36,1,1,0,0,158,59,0,0,1,23000
3,52,1,1,0,1,183,93,0,0,2,28000
4,38,0,0,0,1,166,88,0,0,1,23000


In [4]:
# All columns but the last are features; the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X.shape

(986, 10)

In [5]:
from sklearn.model_selection import train_test_split
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
# First row of DataFrame.mode(): one most-frequent value per training column,
# used to fill missing training values.
train_mode = dict(X_train.mode().iloc[0])
X_train = X_train.fillna(train_mode)
print (train_mode)

{'Age': 45.0, 'Diabetes': 0.0, 'BloodPressureProblems': 0.0, 'AnyTransplants': 0.0, 'AnyChronicDiseases': 0.0, 'Height': 174.0, 'Weight': 70.0, 'KnownAllergies': 0.0, 'HistoryOfCancerInFamily': 0.0, 'NumberOfMajorSurgeries': 0.0}


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Row count corresponding to a 70% share of the full dataset.
n_train = int(len(X) * 0.7)

# Candidate models, mostly with library defaults.
classifiers = [
    RandomForestClassifier(),
    SVC(kernel='linear'),
    DecisionTreeClassifier(),
    LogisticRegression(solver="liblinear", max_iter=100),
    SGDClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    MLPClassifier(),
]

# Report each candidate's mean cross-validated accuracy on the training split.
for model in classifiers:
    mean_accuracy = cross_val_score(model, X_train, y_train, scoring="accuracy").mean()
    print(f"{model.__class__.__name__} accuracy: {mean_accuracy}")



RandomForestClassifier accuracy: 0.8956521739130435
SVC accuracy: 0.8028985507246377
DecisionTreeClassifier accuracy: 0.8782608695652174
LogisticRegression accuracy: 0.7333333333333334
SGDClassifier accuracy: 0.2753623188405797
GaussianNB accuracy: 0.21449275362318837
KNeighborsClassifier accuracy: 0.5840579710144927
MLPClassifier accuracy: 0.7333333333333334


In [8]:
# Fit the two models that are persisted below. Both estimators are randomized,
# so they get a fixed random_state; otherwise every run would write different
# model files for the same data.
rf = RandomForestClassifier(random_state=1234).fit(X_train, y_train)
dt = DecisionTreeClassifier(random_state=1234).fit(X_train, y_train)
# Persist the fill values and both fitted models as compressed joblib files.
joblib.dump(train_mode, "./pi_train_mode.joblib", compress=True)
joblib.dump(rf, "./pi_random_forest.joblib", compress=True)
joblib.dump(dt, "./pi_decision_tree.joblib", compress=True)

['./pi_decision_tree.joblib']

# 3. Machine translation
This section uses two parallel sets of equivalent English and French sentences to train a model that translates English into French.

In [3]:
# Read both corpora one sentence per line (trailing newlines are kept).
# Both files are opened explicitly as UTF-8: without an encoding argument
# open() falls back to the platform default, so the English file would be
# decoded differently from the French one on systems whose default is not UTF-8.
with open("small_vocab_en.txt", encoding='utf-8') as entext:
    english_sentences = list(entext)
with open("small_vocab_fr.txt", encoding='utf-8') as frtext:
    french_sentences = list(frtext)
print(len(english_sentences), len(french_sentences))

137863 137863


In [4]:
# Show the 6th and 7th sentence pair (list indices 5 and 6) side by side.
for sample_i in (5, 6):
    line_no = sample_i + 1
    print(f'small_vocab_en Line {line_no}:  {english_sentences[sample_i]}')
    print(f'small_vocab_fr Line {line_no}:  {french_sentences[sample_i]}')

small_vocab_en Line 6:  she is the kindest of little sisters , and I am proud of her .

small_vocab_fr Line 6:  elle est la plus gentille des petites sœurs, et je suis fier d'elle .

small_vocab_en Line 7:  his favorite food is eggs , but he has no appetite for it at the moment .

small_vocab_fr Line 7:  sa nourriture préférée est les œufs , mais il n'a pas d'appétit pour ça .



In [5]:
def _summarize_vocabulary(language, sentences):
    """Print word statistics for one corpus and return its word Counter.

    Words are the whitespace-separated pieces of each sentence. The total
    word count is taken from the Counter itself rather than by splitting the
    whole corpus a second time.

    Args:
        language: Label used in the printed lines (e.g. 'English').
        sentences: Iterable of sentence strings.

    Returns:
        collections.Counter mapping each word to its number of occurrences.
    """
    counter = collections.Counter(
        word for sentence in sentences for word in sentence.split()
    )
    print('{} {} words.'.format(sum(counter.values()), language))
    print('{} unique {} words.'.format(len(counter), language))
    print('10 Most common words in the {} dataset:'.format(language))
    # Quote each word individually; unlike indexing into zip(*...), this also
    # works when the corpus is empty.
    print(' '.join('"{}"'.format(word) for word, _ in counter.most_common(10)))
    return counter


english_words_counter = _summarize_vocabulary('English', english_sentences)
print()
french_words_counter = _summarize_vocabulary('French', french_sentences)

1823294 English words.
245 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961336 French words.
373 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


In [6]:
def tokenize(x):
    """Fit a word-level Tokenizer on `x` and encode `x` with it.

    Args:
        x: The texts to fit on and to convert.

    Returns:
        A pair (sequences, tokenizer): the result of texts_to_sequences(x)
        and the fitted Tokenizer.
    """
    tokenizer = Tokenizer(char_level=False)
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

def pad(x, length=None):
    """Pad the sequences in `x` at the end to a common length.

    Args:
        x: Collection of sequences.
        length: Target length; when None, the length of the longest
            sequence in `x` is used.

    Returns:
        The result of pad_sequences with padding='post'.
    """
    target_len = length if length is not None else max(map(len, x))
    return pad_sequences(x, maxlen=target_len, padding='post')

def preprocess(x, y):
    """Tokenize and pad the source texts `x` and the target texts `y`.

    Each side gets its own tokenizer and is padded to its own longest
    sequence. The padded target array additionally gets a trailing axis of
    size 1.

    Returns:
        (x_pre, y_pre, x_tk, y_tk): padded source ids, padded target ids with
        the extra trailing axis, and the two fitted tokenizers.
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)
    x_pre = pad(x_ids)
    y_padded = pad(y_ids)
    y_pre = y_padded.reshape(y_padded.shape + (1,))
    return x_pre, y_pre, x_tk, y_tk

# Encode both corpora, then report padded sequence widths and vocabulary sizes.
en_pre, fr_pre, en_tk, fr_tk = preprocess(english_sentences, french_sentences)
print(f"Max English sentence length: {en_pre.shape[1]}")
print(f"Max French sentence length: {fr_pre.shape[1]}")
print(f"English vocabulary size: {len(en_tk.word_index)}")
print(f"French vocabulary size: {len(fr_tk.word_index)}")

Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 216
French vocabulary size: 362


In [7]:
def logits_to_text(logits, tokenizer):
    """Turn a 2-D score array into a space-separated string of words.

    For every row of `logits` the column with the highest score is taken as
    the word id and looked up in the inverse of `tokenizer.word_index`;
    id 0 is rendered as '<PAD>'.
    """
    lookup = dict((index, word) for word, index in tokenizer.word_index.items())
    lookup[0] = '<PAD>'
    best_ids = np.argmax(logits, axis=1)
    return ' '.join(lookup[token_id] for token_id in best_ids)

def custom_model(input_shape, output_len=fr_pre.shape[1], en_vocab_size=len(en_tk.word_index)+1, fr_vocab_size=len(fr_tk.word_index)+1, learning_rate=0.005):
    """Build and compile the encoder/decoder translation network.

    Layers, in order: 128-dim embedding of the source ids, a bidirectional
    GRU(256) that returns only its final state, RepeatVector(output_len) to
    feed that state to every output step, a second bidirectional GRU(256)
    returning the full sequence, and a per-step softmax over `fr_vocab_size`.

    Args:
        input_shape: Shape of the source array; only input_shape[1] is used.
        output_len: Number of output time steps.
        en_vocab_size: Input dimension of the embedding layer.
        fr_vocab_size: Width of the softmax output.
        learning_rate: Passed to the Adam optimizer.

    Returns:
        The compiled Keras model (sparse categorical cross-entropy loss,
        accuracy metric).
    """
    source_ids = Input(shape=(input_shape[1],))
    embedded = Embedding(en_vocab_size, 128)(source_ids)
    encoded = Bidirectional(GRU(256, return_sequences=False))(embedded)
    repeated = RepeatVector(output_len)(encoded)
    decoded = Bidirectional(GRU(256, return_sequences=True))(repeated)
    word_probs = TimeDistributed(Dense(fr_vocab_size, activation="softmax"))(decoded)

    model = tf.keras.models.Model(inputs=source_ids, outputs=word_probs)
    model.compile(
        optimizer=Adam(learning_rate),
        loss=sparse_categorical_crossentropy,
        metrics=['accuracy'],
    )
    return model

# en_pre was already padded inside preprocess(); pad() with no explicit length
# pads to the longest row again.
tmp_x = pad(en_pre)
# Only the second dimension of the shape is read by custom_model().
model = custom_model(tmp_x.shape)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 15)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 15, 128)           27776     
_________________________________________________________________
bidirectional (Bidirectional (None, 512)               592896    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 21, 512)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 21, 512)           1182720   
_________________________________________________________________
time_distributed (TimeDistri (None, 21, 363)           186219    
Total params: 1,989,611
Trainable params: 1,989,611
Non-trainable params: 0
___________________________________________________

In [8]:
# Build a fresh model sized from the padded English array. This module-level
# `model` is the one final_predictions() reads as a global.
tmp_X = pad(en_pre)
model = custom_model(tmp_X.shape)
def final_predictions(x, y, x_tk, y_tk):
    """Train the module-level `model` and print two sample translations.

    The model is fitted on the `x`/`y` arguments (17 epochs, batch size 1024,
    20% validation split). Afterwards it translates a fixed English sentence
    and the first row of `x`, and prints each prediction followed by its
    reference.

    Args:
        x: Padded source id array; x.shape[-1] is the input width.
        y: Target id array; y[0] is printed as the reference for sample 2.
        x_tk: Source tokenizer; its word_index encodes the fixed sentence.
        y_tk: Target tokenizer; its word_index is inverted to decode ids.

    Raises:
        KeyError: If a word of the fixed sentence is missing from
            x_tk.word_index.
    """
    # Train on the data that was passed in rather than on notebook globals,
    # so the arguments are not silently ignored.
    model.fit(x, y, batch_size=1024, epochs=17, validation_split=0.2)

    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    sentence = 'he saw a old yellow truck'
    sentence = [x_tk.word_index[word] for word in sentence.split()]
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
    # Two-row batch: the fixed sentence and the first training sentence.
    sentences = np.array([sentence[0], x[0]])
    predictions = model.predict(sentences, batch_size=len(sentences))

    # Comprehension variables are named so they do not shadow the `x` parameter.
    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[0]]))
    print('Il a vu un vieux camion jaune')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[1]]))
    print(' '.join([y_id_to_word[np.max(step)] for step in y[0]]))
    
# Train the model and print the two sample translations.
final_predictions(en_pre, fr_pre, en_tk, fr_tk)

Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
Sample 1:
il a vu un vieux camion jaune <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Il a vu un vieux camion jaune
Sample 2:
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [9]:
# Save the trained model under the relative path "en_fr_translator".
model.save("en_fr_translator")



INFO:tensorflow:Assets written to: en_fr_translator\assets


INFO:tensorflow:Assets written to: en_fr_translator\assets


In [11]:
# Encode the sample sentence by hand: look up each word's index in the English
# tokenizer, then zero-pad at the end to the width of en_pre.
sentence = 'he saw a old yellow truck'
sentence = [en_tk.word_index[word] for word in sentence.split()]
sentence = pad_sequences([sentence], maxlen=en_pre.shape[-1], padding='post')
# Two-row batch: the sample sentence plus the first training sentence.
sentences = np.array([sentence[0], en_pre[0]])
sentence

array([[ 26, 127, 100, 111, 112, 101,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])

In [26]:
class MTProcessor:
    """Input encoding and output decoding around the translation model.

    Attributes (names kept as-is so existing readers of them keep working):
        dict: Padded source array; only its last dimension (the model's
            input width) is read here.
        token: Fitted source-language tokenizer.
        target: Fitted target-language tokenizer.
    """

    def __init__(self, lang_from, lang_tk, target_tk):
        self.dict = lang_from
        self.token = lang_tk
        self.target = target_tk

    def preprocess(self, data):
        """Encode one source sentence as a padded (1, width) id array.

        The sentence goes through the tokenizer's own texts_to_sequences, so
        it gets the same lower-casing and punctuation filtering the tokenizer
        was fitted with. A plain split() lookup would silently drop words such
        as "I" or "truck." that differ from the vocabulary only by case or
        attached punctuation. Words the tokenizer does not know are still
        left out (assuming it was built without an oov_token).

        Args:
            data: Source sentence as a single string.

        Returns:
            Array of ids, zero-padded at the end to self.dict.shape[-1].
        """
        token_ids = self.token.texts_to_sequences([data])
        return pad_sequences(token_ids, maxlen=self.dict.shape[-1], padding='post')

    def postprocess(self, data):
        """Decode per-step score rows into a target-language string.

        For each row of `data` the highest-scoring id is mapped back to its
        word through the inverted target word_index. Id 0 (padding) is
        rendered as a single space, and the joined result is stripped so
        leading and trailing padding disappears.

        Args:
            data: Iterable of per-step score vectors.

        Returns:
            The decoded sentence.
        """
        id_to_word = {index: word for word, index in self.target.word_index.items()}
        id_to_word[0] = ' '
        return ' '.join(id_to_word[np.argmax(step)] for step in data).strip()
    
# Bundle the padded English array and both tokenizers into a processor and
# persist it as a compressed joblib file.
# NOTE(review): the whole en_pre array is serialized with the processor even
# though MTProcessor only reads its last dimension - consider whether storing
# just that width would be enough.
processor = MTProcessor(en_pre, en_tk, fr_tk)
joblib.dump(processor, './mtprocessor.joblib', compress=True)

['./mtprocessor.joblib']