In [1]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

import pandas as pd
import numpy, string
import matplotlib
import scipy


## Load Data ##

In [2]:
df = pd.read_csv("MOCK_DATA.csv")
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,ip_address,phone,address,ssn
0,1,Cornell,Nye,cnye0@sciencedirect.com,Male,171.95.103.238,128-612-4011,86 Mcbride Street,608-45-8721
1,2,Joseph,Deverill,jdeverill1@whitehouse.gov,Male,173.130.94.151,278-837-7782,6620 Clarendon Road,409-65-3396
2,3,Pegeen,Lerego,plerego2@amazon.co.jp,Female,108.48.103.4,353-423-4599,5711 Troy Junction,204-67-8040
3,4,Janith,Panons,jpanons3@berkeley.edu,Female,34.27.99.66,535-128-9172,32 Manley Alley,396-47-0769
4,5,Cissiee,Myhill,cmyhill4@newyorker.com,Female,28.172.195.129,717-836-1731,750 Corben Junction,691-91-8581


Transpose the data so that we can classify each column from original dataset. Also, add a value that contains the category of data that the row contains.

In [3]:
df = df.transpose()
df['category'] = df.index
df = df.drop('id')
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,category
first_name,Cornell,Joseph,Pegeen,Janith,Cissiee,Berty,Thelma,Roze,Laurena,Tris,...,Rhetta,Quill,Mignon,Denna,Dasi,Lance,Jody,Ron,Benedicto,first_name
last_name,Nye,Deverill,Lerego,Panons,Myhill,Itzkovwitch,Kynnd,Lanktree,Wakeman,Blaske,...,Shrimptone,Cruwys,Brampton,Tussaine,Frean,Zienkiewicz,O'Hone,Divall,Partener,last_name
email,cnye0@sciencedirect.com,jdeverill1@whitehouse.gov,plerego2@amazon.co.jp,jpanons3@berkeley.edu,cmyhill4@newyorker.com,bitzkovwitch5@globo.com,tkynnd6@omniture.com,rlanktree7@theatlantic.com,lwakeman8@businessinsider.com,tblaske9@dion.ne.jp,...,rshrimptonerj@sphinn.com,qcruwysrk@illinois.edu,mbramptonrl@phpbb.com,dtussainerm@etsy.com,dfreanrn@topsy.com,lzienkiewiczro@phpbb.com,johonerp@squidoo.com,rdivallrq@webeden.co.uk,bpartenerrr@last.fm,email
gender,Male,Male,Female,Female,Female,Male,Female,Female,Female,Male,...,Female,Male,Female,Female,Female,Male,Female,Male,Male,gender
ip_address,171.95.103.238,173.130.94.151,108.48.103.4,34.27.99.66,28.172.195.129,187.4.247.67,69.161.64.47,23.124.48.207,242.19.17.111,237.41.32.171,...,182.251.90.108,11.213.152.71,250.197.218.243,157.171.135.231,110.140.197.129,137.211.56.40,165.103.7.160,235.121.72.181,168.40.62.253,ip_address
phone,128-612-4011,278-837-7782,353-423-4599,535-128-9172,717-836-1731,232-641-2554,961-528-1763,404-540-3686,900-866-9467,582-157-5830,...,559-992-7552,921-201-0213,639-710-4689,296-586-9865,827-635-7555,901-524-0024,889-755-8886,299-231-7780,556-471-2895,phone
address,86 Mcbride Street,6620 Clarendon Road,5711 Troy Junction,32 Manley Alley,750 Corben Junction,94120 Ilene Avenue,6553 Farwell Pass,8 Lerdahl Terrace,07324 Texas Pass,8 Rigney Parkway,...,5913 Elmside Way,6413 Coleman Trail,9 Utah Center,2 Kenwood Terrace,156 Evergreen Trail,50 North Center,35 Kingsford Terrace,84073 Carberry Alley,3 Holy Cross Hill,address
ssn,608-45-8721,409-65-3396,204-67-8040,396-47-0769,691-91-8581,362-44-7446,223-43-4071,274-21-5681,863-58-5434,775-29-5872,...,557-91-1165,274-80-0533,569-86-1591,465-29-4146,197-67-3905,389-52-6103,612-74-6131,162-84-8141,245-65-2069,ssn


Transform dataframe so that each row is ('category', 'value')

In [4]:
df = df.melt(id_vars=['category'])
df.head(10)

Unnamed: 0,category,variable,value
0,first_name,0,Cornell
1,last_name,0,Nye
2,email,0,cnye0@sciencedirect.com
3,gender,0,Male
4,ip_address,0,171.95.103.238
5,phone,0,128-612-4011
6,address,0,86 Mcbride Street
7,ssn,0,608-45-8721
8,first_name,1,Joseph
9,last_name,1,Deverill


Extract X, Y from dataframe and create a train/test split. Note that since X contains strings, we need to vectorize them. To do this, I used sklearn's HashingVectorizer. Then the train/test split is made using the transformed X(value) and the original Y(category) from the dataframe.

In [47]:
hash_vect = HashingVectorizer()
X = hash_vect.fit_transform(df['value'])
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

Now we create the LinearSVC model and fit the data to the model.

In [48]:
clf = LinearSVC()
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [49]:
clf.score(X_test, y_test)

0.8558712121212121