# Machine Learning

### Steps
1. Frame the Problem
2. Find and Explore Data
3. Clean and Process Data
4. Extract Features
5. Train and Test the Model
***

### 1. Frame the Problem
We need some way to (with reasonable accuracy) determine a person's gender based on their name.


A conventional approach...

In [1]:
def guess_gender(name):
    if name[-1] == "a":
        pass

Third party/library? https://gender-api.com/en/account/overview

If those don't work, you might have an ML problem

### 2. Find and Explore Data
-https://archive.ics.uci.edu/ml/datasets/Gender+by+Name

-https://www.kaggle.com/datasets/max1mum/gendername-dataset




## Read in the Raw Data

In [2]:
import csv

with open("base_data.csv") as f:
    reader = csv.DictReader(f)
    names = [(r["name"], r["gender"]) for r in reader ]
    

In [3]:
names

[('James', 'M'),
 ('John', 'M'),
 ('Robert', 'M'),
 ('Michael', 'M'),
 ('William', 'M'),
 ('Mary', 'F'),
 ('David', 'M'),
 ('Joseph', 'M'),
 ('Richard', 'M'),
 ('Charles', 'M'),
 ('Thomas', 'M'),
 ('Christopher', 'M'),
 ('Daniel', 'M'),
 ('Matthew', 'M'),
 ('Elizabeth', 'F'),
 ('Patricia', 'F'),
 ('Jennifer', 'F'),
 ('Anthony', 'M'),
 ('George', 'M'),
 ('Linda', 'F'),
 ('Barbara', 'F'),
 ('Donald', 'M'),
 ('Paul', 'M'),
 ('Mark', 'M'),
 ('Andrew', 'M'),
 ('Steven', 'M'),
 ('Kenneth', 'M'),
 ('Edward', 'M'),
 ('Joshua', 'M'),
 ('Margaret', 'F'),
 ('Brian', 'M'),
 ('Kevin', 'M'),
 ('Jessica', 'F'),
 ('Sarah', 'F'),
 ('Susan', 'F'),
 ('Timothy', 'M'),
 ('Dorothy', 'F'),
 ('Jason', 'M'),
 ('Ronald', 'M'),
 ('Helen', 'F'),
 ('Ryan', 'M'),
 ('Jeffrey', 'M'),
 ('Karen', 'F'),
 ('Nancy', 'F'),
 ('Betty', 'F'),
 ('Lisa', 'F'),
 ('Jacob', 'M'),
 ('Nicholas', 'M'),
 ('Ashley', 'F'),
 ('Eric', 'M'),
 ('Frank', 'M'),
 ('Gary', 'M'),
 ('Anna', 'F'),
 ('Stephen', 'M'),
 ('Jonathan', 'M'),
 ('Sandra',

### 3. Clean and Process Data

What information do we really need?

Can we simplify or regularize our data?

How can we organize and prepare our data for further processing?


In [4]:
# lowercase names, convert categories to numbers, add "features" for later use
## result: Array of Tuples [ (name, gender, {"features": []}) ]
cleaned_names = [ (t[0].lower(), 0 if t[1] == "M" else 1, {"features": []}) for t in names]

In [5]:
cleaned_names

[('james', 0, {'features': []}),
 ('john', 0, {'features': []}),
 ('robert', 0, {'features': []}),
 ('michael', 0, {'features': []}),
 ('william', 0, {'features': []}),
 ('mary', 1, {'features': []}),
 ('david', 0, {'features': []}),
 ('joseph', 0, {'features': []}),
 ('richard', 0, {'features': []}),
 ('charles', 0, {'features': []}),
 ('thomas', 0, {'features': []}),
 ('christopher', 0, {'features': []}),
 ('daniel', 0, {'features': []}),
 ('matthew', 0, {'features': []}),
 ('elizabeth', 1, {'features': []}),
 ('patricia', 1, {'features': []}),
 ('jennifer', 1, {'features': []}),
 ('anthony', 0, {'features': []}),
 ('george', 0, {'features': []}),
 ('linda', 1, {'features': []}),
 ('barbara', 1, {'features': []}),
 ('donald', 0, {'features': []}),
 ('paul', 0, {'features': []}),
 ('mark', 0, {'features': []}),
 ('andrew', 0, {'features': []}),
 ('steven', 0, {'features': []}),
 ('kenneth', 0, {'features': []}),
 ('edward', 0, {'features': []}),
 ('joshua', 0, {'features': []}),
 ('ma

### 4. Extract Features

Everything has to end up as a number
Model features as a pipeline

In [6]:
# takes a tuple, returns a tuple
def last_char_is_a(t):
    if t[0][-1] == "a":
        t[2]["features"].append(1)
    else:
        t[2]["features"].append(0)
    return t
    


In [7]:
def last_letter(t):
    vowel_converter = {
        "a": 0,
        "e": 1,
        "i": 2,
        "o": 3,
        "u": 4,
        "y": 5
    }
    one_hot = [0] * 6
    # ex: [1,0,0,0,0,0] == "a"
    #     [0,1,0,0,0,0] == "e"
    
    last_char = t[0][-1]
    vowel_value = vowel_converter.get(last_char, None)
    if vowel_value:
        one_hot[vowel_value] = 1
    t[2]["features"].append(one_hot)
    return t

In [8]:
def name_length(t):
    name = t[0]
    t[2]["features"].append(len(name))
    return t

In [9]:
def all_letters(t):
    name = t[0]
    from collections import Counter
    c = list(Counter(list(name)).keys())
    base = [x for x in range(97,123)]
    one_hot = [1 if chr(x) in c else 0 for x in base]
    t[2]["features"].append(one_hot)
    return t

In [10]:
def flatten_features(t):
    flatten_list = lambda irregular_list:[element for item in irregular_list for element in flatten_list(item)] if type(irregular_list) is list else [irregular_list]
    t[2]["features"] = flatten_list(t[2]["features"])
    return t

### Run the Pipeline to Extract Features

In [11]:


from functools import reduce

def apply(p, func):
    return func(p)

pipeline = [last_char_is_a, last_letter, name_length, all_letters, flatten_features] 


cleaned_with_features = [reduce(lambda p, func: func(p), pipeline, t) for t in cleaned_names]


In [12]:
len(cleaned_with_features)

147269

### 5. Train and Test the Model

In [13]:
# Create a Training set and a Testing Set

import random

random.shuffle(cleaned_with_features)

train_set = cleaned_with_features[:100000]
test_set = cleaned_with_features[100000:120000]


import numpy as np

train_set_features = [np.array(v[2]["features"]) for v in train_set]
train_set_labels = [np.array(v[1]) for v in train_set]

test_set_features = [np.array(v[2]["features"]) for v in test_set]
test_set_labels = [np.array(v[1]) for v in test_set]

In [14]:
train_set_labels

[array(1),
 array(0),
 array(1),
 array(0),
 array(1),
 array(0),
 array(1),
 array(1),
 array(0),
 array(0),
 array(0),
 array(1),
 array(1),
 array(0),
 array(0),
 array(0),
 array(0),
 array(1),
 array(1),
 array(0),
 array(0),
 array(0),
 array(0),
 array(1),
 array(1),
 array(0),
 array(1),
 array(0),
 array(1),
 array(1),
 array(1),
 array(1),
 array(1),
 array(1),
 array(1),
 array(1),
 array(0),
 array(1),
 array(0),
 array(0),
 array(0),
 array(1),
 array(0),
 array(1),
 array(1),
 array(0),
 array(1),
 array(0),
 array(0),
 array(0),
 array(1),
 array(0),
 array(0),
 array(1),
 array(1),
 array(1),
 array(1),
 array(0),
 array(0),
 array(1),
 array(0),
 array(1),
 array(0),
 array(1),
 array(0),
 array(0),
 array(1),
 array(1),
 array(0),
 array(1),
 array(1),
 array(1),
 array(1),
 array(1),
 array(1),
 array(0),
 array(1),
 array(0),
 array(1),
 array(1),
 array(1),
 array(1),
 array(1),
 array(0),
 array(1),
 array(0),
 array(1),
 array(1),
 array(1),
 array(1),
 array(1),

In [15]:
cleaned_with_features[0][2]

{'features': [0,
  0,
  1,
  0,
  0,
  0,
  0,
  10,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0]}

In [16]:
from sklearn.naive_bayes import GaussianNB
clf_gaussianNB = GaussianNB()
clf_gaussianNB.fit(train_set_features, train_set_labels)

from sklearn.tree import DecisionTreeClassifier
clf_decTree = DecisionTreeClassifier()
clf_decTree.fit(train_set_features, train_set_labels)

from sklearn.linear_model import RidgeClassifier
clf_RidgeClassifier = RidgeClassifier()
clf_RidgeClassifier.fit(train_set_features, train_set_labels)



RidgeClassifier()

In [17]:


print("GaussianNB: ", clf_gaussianNB.score(test_set_features, test_set_labels))
print("DecisionTreeClassifier: ", clf_decTree.score(test_set_features, test_set_labels))
print("RidgeClassifier: ", clf_RidgeClassifier.score(test_set_features, test_set_labels))

GaussianNB:  0.6899
DecisionTreeClassifier:  0.6924
RidgeClassifier:  0.74655


## Save the Model

In [18]:
import pickle

with open("chosen_model.pickle", "wb") as f:
    pickle.dump(clf_RidgeClassifier, f)

## Test the Model on Out of Sample Data (sanity check)

In [19]:

with open("oos_names.csv") as f:
    reader = csv.DictReader(f)
    oos_names = [(r["name"], r["gender"]) for r in reader ]
    
oos_names = [(t[0].lower(), 0 if t[1] == "M" else 1, {"features": []}) for t in oos_names]
oos_names = [reduce(lambda p, func: func(p), pipeline, t) for t in oos_names]
oos_names_features = [np.array(v[2]["features"]) for v in oos_names]
oos_names_categories = [np.array(v[1]) for v in oos_names]

In [20]:
with open("chosen_model.pickle", "rb") as f:
     chosen_model = pickle.load(f)
print("Sanity Test: ", chosen_model.score(oos_names_features, oos_names_categories))
print(chosen_model.predict(oos_names_features))
print(oos_names_categories)


Sanity Test:  0.8
[0 0 0 0 0 0 1 0 1 1]
[array(0), array(0), array(0), array(0), array(0), array(1), array(1), array(1), array(1), array(1)]


In [21]:
chosen_model.classes_
chosen_model.n_features_in_

34