In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

import sys 
sys.path.append('../')
import src.model.feature_cleaning as feature_cleaning

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


#sklearn models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
#sklearn other
import graphviz 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, log_loss, accuracy_score
from mlxtend.plotting import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

  from numpy.core.umath_tests import inner1d


In [2]:
# Unpack the seven objects returned by the project's loader in one call.
(
    df,
    fieldofdegree_df,
    SOCP_labels,
    schl_labels,
    major_majors,
    NAICSP_labels_df,
    MAJ_NAICSP_labels_df,
) = feature_cleaning.load_dfs()

In [3]:
# Build the target frame: clean_that_target runs first, and its result is
# passed straight into single_occ_target.
youngemp_df = feature_cleaning.single_occ_target(
    feature_cleaning.clean_that_target(df, SOCP_labels)
)

# Derive the education frame used for modelling from the target frame and
# the degree / schooling lookup objects.
edu_df = feature_cleaning.create_edu_df(
    youngemp_df,
    fieldofdegree_df,
    schl_labels,
    major_majors,
)

  SOCPdf = df.dropna(axis='index', subset=['SOCP'])[df.SOCP != '999920']


Number of employed people: 218454
Percent employed people: 0.5785711448056677
Number of young employed people: 77406
Percent young employed people(out of all PUMS): 0.20500827650135733
Number of emp cats: 23
Number of degree fields present (max 173): 173


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  edu_df['SCHL_labels'] = edu_df.SCHL.map(schl_labels)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  edu_df['SCHL_ord'] = edu_df.SCHL.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats

before dummies:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 77406 entries, 0 to 77405
Data columns (total 14 columns):
SERIALNO            77406 non-null int64
SOCP                77406 non-null object
MAJ_SOCP            77406 non-null object
MAJ_SOCP_labels     77406 non-null object
MAJ_SOCP_15         77406 non-null int64
FOD1P               77406 non-null object
FOD2P               77406 non-null object
FOD1P_labels        77406 non-null object
FOD2P_labels        77406 non-null object
SCHL                77406 non-null object
SCHL_labels         77406 non-null object
SCHL_ord            77406 non-null int64
FOD1P_MAJ           77406 non-null int64
FOD1P_MAJ_labels    77406 non-null object
dtypes: int64(4), object(10)
memory usage: 53.1 MB
None


In [None]:
edu_df.sample(3)

In [4]:
# split the data
X = edu_df.drop(columns=[ 'SERIALNO', 'FOD1P', 'FOD2P','SOCP','MAJ_SOCP','MAJ_SOCP_labels', 
                'MAJ_SOCP_15','FOD1P_labels','FOD2P_labels','SCHL',
                'SCHL_labels','FOD1P_MAJ_labels', 'FOD1P_MAJ'])
y = edu_df.loc[:,'MAJ_SOCP_15']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3, 
                                                    random_state=42)

In [None]:
X_train.shape

In [None]:
X_train.sample(5)

In [None]:
y_train.shape

In [None]:

# model pipelines
#-----------------------------------
def _scaled_pca_pipeline(classifier, seed=42):
    """Return a scale -> 2-component PCA -> classifier pipeline.

    Parameters
    ----------
    classifier : estimator
        The final 'clf' step of the pipeline.
    seed : int
        Seed handed to PCA. With svd_solver='auto', PCA may select the
        randomized solver on large inputs; without a seed the projected
        features (and therefore every downstream score) can change from run
        to run even though the classifiers themselves are seeded.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps are named 'scl', 'pca' and 'clf' so the 'clf__*' grid-search
        parameter names keep working.
    """
    return Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2, random_state=seed)),
                     ('clf', classifier)])


pipe_lr_pca = _scaled_pca_pipeline(LogisticRegression(random_state=42))

pipe_sgd_pca = _scaled_pca_pipeline(SGDClassifier(random_state=42))

pipe_rf_pca = _scaled_pca_pipeline(RandomForestClassifier(random_state=42))

pipe_svm_pca = _scaled_pca_pipeline(SVC(random_state=42))

#-----------------------------------

In [None]:

# grid search params
# Shared value ranges reused by several of the grids below.
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]
max_depth = [10, 100, 1000, 10000]
alpha_range = [.1, .001, .00001, .000001]
gamma_range = [.1, 1, 10]

#-------------linear
# L1-penalised logistic regression (liblinear is the only solver listed).
grid_params_lr = [{
    'clf__penalty': ['l1'],
    'clf__C': param_range_fl,
    'clf__solver': ['liblinear', ],  #,'saga'
    #'clf__multi_class': ['ovr', 'multinomial', 'auto'],
    'clf__class_weight': [None, 'balanced'],
}]

# L2-penalised logistic regression across three solvers.
grid_params_lr_l2 = [{
    'clf__penalty': ['l2'],
    'clf__C': param_range_fl,
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear'],  #, 'sag'
    #'clf__multi_class': ['ovr', 'multinomial', 'auto'],
    'clf__class_weight': [None, 'balanced'],
}]

grid_params_sgd = [{
    'clf__loss': ['hinge', 'log', 'perceptron'],
    'clf__alpha': alpha_range,
    'clf__penalty': ['l1', 'l2', 'elasticnet'],
    'clf__class_weight': [None, 'balanced'],
}]

#-------------trees
grid_params_dt = [{
    'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_leaf': param_range,
    'clf__max_depth': max_depth,
    # min_samples_split starts at 2, so the leading 1 is skipped
    'clf__min_samples_split': param_range[1:],
    'clf__class_weight': [None, 'balanced'],
}]

grid_params_rf = [{
    'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_leaf': param_range,
    'clf__max_depth': max_depth,
    'clf__min_samples_split': param_range[1:],
    'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
}]

grid_params_gb = [{
    'clf__loss': ['deviance', 'exponential'],
    'clf__learning_rate': alpha_range,
    # NOTE: the `max_depth` value list is reused here as the n_estimators range
    'clf__n_estimators': max_depth,
    'clf__subsample': param_range_fl,
}]

#-------------SVM
# One sub-grid per kernel so that parameters a kernel ignores are not crossed
# into the search: `degree` only affects 'poly', and `gamma` is not used by
# 'linear'. A single combined grid would fit 486 candidates per fold, of which
# only 186 are distinct models.
grid_params_svm = [
    {'clf__kernel': ['linear'],
     'clf__C': gamma_range,
     'clf__class_weight': [None, 'balanced']},
    {'clf__kernel': ['rbf'],
     'clf__gamma': gamma_range,
     'clf__C': gamma_range,
     'clf__class_weight': [None, 'balanced']},
    {'clf__kernel': ['poly'],
     'clf__degree': param_range[1:],
     'clf__gamma': gamma_range,
     'clf__C': gamma_range,
     'clf__class_weight': [None, 'balanced']},
]

#-------------KNN
grid_params_knn = [{'clf__n_neighbors': param_range}]

In [None]:
# Construct grid searches
jobs = -1
verbose = 10


def _f1_micro_search(pipeline, grid, **extra):
    """Build a 10-fold GridSearchCV scored on micro-averaged F1.

    `extra` is forwarded verbatim to GridSearchCV (used below for n_jobs).
    """
    return GridSearchCV(estimator=pipeline,
                        param_grid=grid,
                        scoring='f1_micro',
                        cv=10,
                        verbose=verbose,
                        **extra)


# Linear models: n_jobs is left at its default.
gs_lr_pca = _f1_micro_search(pipe_lr_pca, grid_params_lr)
gs_lr_pca_l2 = _f1_micro_search(pipe_lr_pca, grid_params_lr_l2)
gs_sgd_pca = _f1_micro_search(pipe_sgd_pca, grid_params_sgd)

# Random forest and SVM: run with n_jobs=jobs.
gs_rf_pca = _f1_micro_search(pipe_rf_pca, grid_params_rf, n_jobs=jobs)
gs_svm_pca = _f1_micro_search(pipe_svm_pca, grid_params_svm, n_jobs=jobs)


In [None]:
# List of pipelines for ease of iteration
grids = [gs_lr_pca, gs_lr_pca_l2, gs_sgd_pca, gs_rf_pca, gs_svm_pca]

# Dictionary of pipelines and classifier types for ease of reference
grid_dict = {0: 'Logistic Regression w/ L1 pca', 1: 'LogisticRegression w/ L2 pca', 2: 'SGDClassifier pca', 
             3: 'Random Forest pca scaling', 4:'SVC'}

# Fit the grid search objects
print('Performing model optimizations...')
# Start below any attainable score so the first fitted search always becomes
# the current best. Starting at 0.0 with a placeholder string meant that when
# no model scored above zero, the placeholder itself was pickled and index 0
# was reported as the winner.
best_f1_micro = float('-inf')
best_clf = None
best_gs = None
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search
    gs.fit(X_train, y_train)

    # Best params
    print('Best params: %s' % gs.best_params_)

    # Best cross-validated score on the training data (scoring='f1_micro')
    print('Best training f1: %.3f' % gs.best_score_)

    # Predict on test data with best params
    y_pred = gs.predict(X_test)

    # Score the hold-out set once, with the same micro average the grid search
    # optimises; f1_score's default average='binary' is a different metric and
    # is not comparable with gs.best_score_.
    test_f1_micro = f1_score(y_test, y_pred, average='micro')
    print('Test set f1 score for best params: %.3f ' % test_f1_micro)

    # Track best (highest test f1) model
    if test_f1_micro > best_f1_micro:
        best_f1_micro = test_f1_micro
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set f1: %s' % grid_dict[best_clf])

# Save best grid search pipeline to file
dump_file = 'best_model_no_feat_sel_extr_occ_15.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

In [5]:
# Baseline prediction vector: zeros for every training row except one slot,
# which is left off here.
n_zero_rows = len(y_train) - 1
y_pred_mostly_0 = np.zeros(n_zero_rows)
y_pred_mostly_0.shape

(54183,)

In [6]:
# Add a single positive prediction at the end of the baseline vector.
y_pred_mostly_0 = np.concatenate((y_pred_mostly_0, [1]))

In [7]:
y_pred_mostly_0.shape

(54184,)

In [8]:
f1_score(y_train, y_pred_mostly_0)

0.0

In [11]:
# All-zeros baseline: accuracy obtained by predicting class 0 for every
# training row. The vector is built here because no other cell defines it,
# so on a fresh kernel this cell would otherwise raise NameError.
y_pred_all_0 = np.zeros(len(y_train))
accuracy_score(y_train, y_pred_all_0)

0.9640853388454156

In [9]:
accuracy_score(y_train, y_pred_mostly_0)

0.9640668832127566

In [None]:
# Check whether loading the pre-cleaned DataFrame from CSV works on EC2.

In [15]:
# Reload the education frame from its CSV export.
edu_df = pd.read_csv('edu_df_15.csv')

# split the data, choosing only edu cols
# 'Unnamed: 0' is dropped together with the id, raw-code, label and target
# columns.
non_feature_cols = [
    'Unnamed: 0', 'SERIALNO', 'FOD1P', 'FOD2P', 'SOCP', 'MAJ_SOCP', 'MAJ_SOCP_labels',
    'MAJ_SOCP_15', 'FOD1P_labels', 'FOD2P_labels', 'SCHL',
    'SCHL_labels', 'FOD1P_MAJ_labels', 'FOD1P_MAJ',
]
X = edu_df.drop(columns=non_feature_cols)
y = edu_df['MAJ_SOCP_15']

# 70/30 hold-out split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [17]:
X.head()


Unnamed: 0,SCHL_ord,"FOD1P_MAJ__Agriculture, agriculture operations, and related sciences",FOD1P_MAJ__Architecture and related services,"FOD1P_MAJ__Area, ethnic, cultural, gender, and group studies",FOD1P_MAJ__Biological and biomedical sciences,"FOD1P_MAJ__Business, management, marketing, and related support services","FOD1P_MAJ__Communication, journalism, and related programs",FOD1P_MAJ__Communications technologies/technicians and support services,FOD1P_MAJ__Computer and information sciences and support services,FOD1P_MAJ__Construction trades,FOD1P_MAJ__Education,FOD1P_MAJ__Engineering,FOD1P_MAJ__Engineering technologies and engineering-related fields,FOD1P_MAJ__English language and literature/letters,FOD1P_MAJ__Family and consumer sciences/human sciences,"FOD1P_MAJ__Foreign languages, literatures, and linguistics",FOD1P_MAJ__Health professions and related programs,FOD1P_MAJ__History,"FOD1P_MAJ__Homeland security, law enforcement, firefighting and related protective services",FOD1P_MAJ__Legal professions and studies,"FOD1P_MAJ__Liberal arts and sciences, general studies and humanities",FOD1P_MAJ__Library science,FOD1P_MAJ__Mathematics and statistics,FOD1P_MAJ__Mechanic and repair technologies/technicians,"FOD1P_MAJ__Military science, leadership and operational art",FOD1P_MAJ__Multi/interdisciplinary studies,FOD1P_MAJ__Natural resources and conservation,FOD1P_MAJ__No major,"FOD1P_MAJ__Parks, recreation, leisure, and fitness studies",FOD1P_MAJ__Personal and culinary services,FOD1P_MAJ__Philosophy and religious studies,FOD1P_MAJ__Physical sciences,FOD1P_MAJ__Psychology,FOD1P_MAJ__Public administration and social service professions,FOD1P_MAJ__Science technologies/technicians,FOD1P_MAJ__Social sciences,FOD1P_MAJ__Theology and religious vocations,FOD1P_MAJ__Transportation and materials moving,FOD1P_MAJ__Visual and performing arts,FOD1P__Accounting,FOD1P__Actuarial Science,FOD1P__Advertising And Public Relations,FOD1P__Aerospace Engineering,FOD1P__Agricultural 
Economics,FOD1P__Agriculture Production And Management,FOD1P__Animal Sciences,FOD1P__Anthropology And Archeology,FOD1P__Applied Mathematics,FOD1P__Architectural Engineering,FOD1P__Architecture,FOD1P__Area Ethnic And Civilization Studies,FOD1P__Art And Music Education,FOD1P__Art History And Criticism,FOD1P__Astronomy And Astrophysics,FOD1P__Atmospheric Sciences And Meteorology,FOD1P__Biochemical Sciences,FOD1P__Biological Engineering,FOD1P__Biology,FOD1P__Biomedical Engineering,FOD1P__Botany,FOD1P__Business Economics,FOD1P__Business Management And Administration,FOD1P__Chemical Engineering,FOD1P__Chemistry,FOD1P__Civil Engineering,FOD1P__Clinical Psychology,FOD1P__Cognitive Science And Biopsychology,FOD1P__Commercial Art And Graphic Design,FOD1P__Communication Disorders Sciences And Services,FOD1P__Communication Technologies,FOD1P__Communications,FOD1P__Community And Public Health,FOD1P__Composition And Rhetoric,FOD1P__Computer Administration Management And Security,FOD1P__Computer And Information Systems,FOD1P__Computer Engineering,FOD1P__Computer Networking And Telecommunications,FOD1P__Computer Programming And Data Processing,FOD1P__Computer Science,FOD1P__Construction Services,FOD1P__Cosmetology Services And Culinary Arts,FOD1P__Counseling Psychology,FOD1P__Court Reporting,FOD1P__Criminal Justice And Fire Protection,FOD1P__Criminology,FOD1P__Drama And Theater Arts,FOD1P__Early Childhood Education,FOD1P__Ecology,FOD1P__Economics,FOD1P__Educational Administration And Supervision,FOD1P__Educational Psychology,FOD1P__Electrical Engineering,FOD1P__Electrical Engineering Technology,"FOD1P__Electrical, Mechanical, And Precision Technologies And Production",FOD1P__Elementary Education,FOD1P__Engineering And Industrial Management,FOD1P__Engineering Mechanics Physics And Science,FOD1P__Engineering Technologies,FOD1P__English Language And Literature,FOD1P__Environmental Engineering,FOD1P__Environmental Science,FOD1P__Family And Consumer Sciences,FOD1P__Film Video And 
Photographic Arts,FOD1P__Finance,FOD1P__Fine Arts,FOD1P__Food Science,FOD1P__Forestry,FOD1P__French German Latin And Other Common Foreign Language Studies,FOD1P__General Agriculture,FOD1P__General Business,FOD1P__General Education,FOD1P__General Engineering,FOD1P__General Medical And Health Services,FOD1P__General Social Sciences,FOD1P__Genetics,FOD1P__Geography,FOD1P__Geological And Geophysical Engineering,FOD1P__Geology And Earth Science,FOD1P__Geosciences,FOD1P__Health And Medical Administrative Services,FOD1P__Health And Medical Preparatory Programs,FOD1P__History,FOD1P__Hospitality Management,FOD1P__Human Resources And Personnel Management,FOD1P__Human Services And Community Organization,FOD1P__Humanities,FOD1P__Industrial And Manufacturing Engineering,FOD1P__Industrial And Organizational Psychology,FOD1P__Industrial Production Technologies,FOD1P__Information Sciences,FOD1P__Intercultural And International Studies,FOD1P__Interdisciplinary Social Sciences,FOD1P__International Business,FOD1P__International Relations,FOD1P__Journalism,FOD1P__Language And Drama Education,FOD1P__Liberal Arts,FOD1P__Library Science,FOD1P__Linguistics And Comparative Language And Literature,FOD1P__Management Information Systems And Statistics,FOD1P__Marketing And Marketing Research,FOD1P__Mass Media,FOD1P__Materials Engineering And Materials Science,FOD1P__Materials Science,FOD1P__Mathematics,FOD1P__Mathematics And Computer Science,FOD1P__Mathematics Teacher Education,FOD1P__Mechanical Engineering,FOD1P__Mechanical Engineering Related Technologies,FOD1P__Medical Assisting Services,FOD1P__Medical Technologies Technicians,FOD1P__Metallurgical Engineering,FOD1P__Microbiology,FOD1P__Military Technologies,FOD1P__Mining And Mineral Engineering,FOD1P__Miscellaneous Agriculture,FOD1P__Miscellaneous Biology,FOD1P__Miscellaneous Business & Medical Administration,FOD1P__Miscellaneous Education,FOD1P__Miscellaneous Engineering,FOD1P__Miscellaneous Engineering Technologies,FOD1P__Miscellaneous 
Fine Arts,FOD1P__Miscellaneous Health Medical Professions,FOD1P__Miscellaneous Psychology,FOD1P__Miscellaneous Social Sciences,FOD1P__Molecular Biology,FOD1P__Multi-Disciplinary Or General Science,FOD1P__Multi/Interdisciplinary Studies,FOD1P__Music,FOD1P__Natural Resources Management,FOD1P__Naval Architecture And Marine Engineering,FOD1P__Neuroscience,FOD1P__No major,FOD1P__Nuclear Engineering,"FOD1P__Nuclear, Industrial Radiology, And Biological Technologies",FOD1P__Nursing,FOD1P__Nutrition Sciences,FOD1P__Oceanography,FOD1P__Operations Logistics And E-Commerce,FOD1P__Other Foreign Languages,FOD1P__Petroleum Engineering,FOD1P__Pharmacology,FOD1P__Pharmacy Pharmaceutical Sciences And Administration,FOD1P__Philosophy And Religious Studies,FOD1P__Physical And Health Education Teaching,FOD1P__Physical Fitness Parks Recreation And Leisure,FOD1P__Physical Sciences,FOD1P__Physics,FOD1P__Physiology,FOD1P__Plant Science And Agronomy,FOD1P__Political Science And Government,FOD1P__Pre-Law And Legal Studies,FOD1P__Psychology,FOD1P__Public Administration,FOD1P__Public Policy,FOD1P__School Student Counseling,FOD1P__Science And Computer Teacher Education,FOD1P__Secondary Teacher Education,FOD1P__Social Psychology,FOD1P__Social Science Or History Teacher Education,FOD1P__Social Work,FOD1P__Sociology,FOD1P__Soil Science,FOD1P__Special Needs Education,FOD1P__Statistics And Decision Science,FOD1P__Studio Arts,FOD1P__Teacher Education: Multiple Levels,FOD1P__Theology And Religious Vocations,FOD1P__Transportation Sciences And Technologies,FOD1P__Treatment Therapy Professions,FOD1P__United States History,FOD1P__Visual And Performing Arts,FOD1P__Zoology,FOD2P__Accounting,FOD2P__Advertising And Public Relations,FOD2P__Aerospace Engineering,FOD2P__Agriculture Production And Management,FOD2P__Animal Sciences,FOD2P__Anthropology And Archeology,FOD2P__Applied Mathematics,FOD2P__Architecture,FOD2P__Area Ethnic And Civilization Studies,FOD2P__Art And Music Education,FOD2P__Art History And 
Criticism,FOD2P__Astronomy And Astrophysics,FOD2P__Atmospheric Sciences And Meteorology,FOD2P__Biochemical Sciences,FOD2P__Biological Engineering,FOD2P__Biology,FOD2P__Biomedical Engineering,FOD2P__Business Economics,FOD2P__Business Management And Administration,FOD2P__Chemistry,FOD2P__Civil Engineering,FOD2P__Clinical Psychology,FOD2P__Cognitive Science And Biopsychology,FOD2P__Commercial Art And Graphic Design,FOD2P__Communication Disorders Sciences And Services,FOD2P__Communication Technologies,FOD2P__Communications,FOD2P__Community And Public Health,FOD2P__Composition And Rhetoric,FOD2P__Computer Administration Management And Security,FOD2P__Computer And Information Systems,FOD2P__Computer Engineering,FOD2P__Computer Networking And Telecommunications,FOD2P__Computer Programming And Data Processing,FOD2P__Computer Science,FOD2P__Cosmetology Services And Culinary Arts,FOD2P__Counseling Psychology,FOD2P__Criminal Justice And Fire Protection,FOD2P__Criminology,FOD2P__Drama And Theater Arts,FOD2P__Early Childhood Education,FOD2P__Ecology,FOD2P__Economics,FOD2P__Educational Administration And Supervision,FOD2P__Educational Psychology,FOD2P__Electrical Engineering,FOD2P__Electrical Engineering Technology,"FOD2P__Electrical, Mechanical, And Precision Technologies And Production",FOD2P__Elementary Education,FOD2P__Engineering And Industrial Management,FOD2P__Engineering Mechanics Physics And Science,FOD2P__Engineering Technologies,FOD2P__English Language And Literature,FOD2P__Environmental Engineering,FOD2P__Environmental Science,FOD2P__Family And Consumer Sciences,FOD2P__Film Video And Photographic Arts,FOD2P__Finance,FOD2P__Fine Arts,FOD2P__Food Science,FOD2P__Forestry,FOD2P__French German Latin And Other Common Foreign Language Studies,FOD2P__General Business,FOD2P__General Education,FOD2P__General Engineering,FOD2P__General Medical And Health Services,FOD2P__General Social Sciences,FOD2P__Genetics,FOD2P__Geography,FOD2P__Geology And Earth 
Science,FOD2P__Geosciences,FOD2P__Health And Medical Administrative Services,FOD2P__Health And Medical Preparatory Programs,FOD2P__History,FOD2P__Hospitality Management,FOD2P__Human Resources And Personnel Management,FOD2P__Human Services And Community Organization,FOD2P__Humanities,FOD2P__Industrial And Manufacturing Engineering,FOD2P__Industrial And Organizational Psychology,FOD2P__Industrial Production Technologies,FOD2P__Information Sciences,FOD2P__Intercultural And International Studies,FOD2P__Interdisciplinary Social Sciences,FOD2P__International Business,FOD2P__International Relations,FOD2P__Journalism,FOD2P__Language And Drama Education,FOD2P__Liberal Arts,FOD2P__Library Science,FOD2P__Linguistics And Comparative Language And Literature,FOD2P__Management Information Systems And Statistics,FOD2P__Marketing And Marketing Research,FOD2P__Mass Media,FOD2P__Materials Engineering And Materials Science,FOD2P__Materials Science,FOD2P__Mathematics,FOD2P__Mathematics And Computer Science,FOD2P__Mechanical Engineering,FOD2P__Medical Assisting Services,FOD2P__Medical Technologies Technicians,FOD2P__Metallurgical Engineering,FOD2P__Microbiology,FOD2P__Miscellaneous Biology,FOD2P__Miscellaneous Business & Medical Administration,FOD2P__Miscellaneous Education,FOD2P__Miscellaneous Engineering,FOD2P__Miscellaneous Engineering Technologies,FOD2P__Miscellaneous Fine Arts,FOD2P__Miscellaneous Health Medical Professions,FOD2P__Miscellaneous Psychology,FOD2P__Miscellaneous Social Sciences,FOD2P__Molecular Biology,FOD2P__Multi-Disciplinary Or General Science,FOD2P__Multi/Interdisciplinary Studies,FOD2P__Music,FOD2P__Natural Resources Management,FOD2P__Neuroscience,FOD2P__No major,FOD2P__Nursing,FOD2P__Nutrition Sciences,FOD2P__Oceanography,FOD2P__Operations Logistics And E-Commerce,FOD2P__Other Foreign Languages,FOD2P__Pharmacy Pharmaceutical Sciences And Administration,FOD2P__Philosophy And Religious Studies,FOD2P__Physical And Health Education Teaching,FOD2P__Physical Fitness 
Parks Recreation And Leisure,FOD2P__Physical Sciences,FOD2P__Physics,FOD2P__Physiology,FOD2P__Plant Science And Agronomy,FOD2P__Political Science And Government,FOD2P__Pre-Law And Legal Studies,FOD2P__Psychology,FOD2P__Public Administration,FOD2P__Public Policy,FOD2P__Science And Computer Teacher Education,FOD2P__Secondary Teacher Education,FOD2P__Social Science Or History Teacher Education,FOD2P__Social Work,FOD2P__Sociology,FOD2P__Soil Science,FOD2P__Special Needs Education,FOD2P__Statistics And Decision Science,FOD2P__Studio Arts,FOD2P__Theology And Religious Vocations,FOD2P__Treatment Therapy Professions,FOD2P__United States History,FOD2P__Visual And Performing Arts,FOD2P__Zoology,"SCHL__1 or more years of college credit, no degree",SCHL__12th grade - no diploma,SCHL__Associate's degree,SCHL__Bachelor's degree,SCHL__Doctorate degree,SCHL__GED or alternative credential,SCHL__Grade 1,SCHL__Grade 10,SCHL__Grade 11,SCHL__Grade 2,SCHL__Grade 3,SCHL__Grade 4,SCHL__Grade 5,SCHL__Grade 6,SCHL__Grade 7,SCHL__Grade 8,SCHL__Grade 9,SCHL__Kindergarten,SCHL__Master's degree,SCHL__No schooling completed,"SCHL__Nursery school, preschool",SCHL__Professional degree beyond a bachelor's degree,SCHL__Regular high school diploma,"SCHL__Some college, but less than 1 year"
0,21,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,22,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,21,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [13]:
y_train

31525    0
53254    0
28462    0
48471    0
12160    0
41246    0
18057    0
45305    0
7079     1
52867    0
22128    0
43749    0
66658    0
54019    0
3094     0
70768    0
33009    0
37497    0
66153    0
43267    0
36785    0
30488    0
55613    0
18346    0
7463     0
22478    0
41927    0
57428    0
40202    0
64448    0
36758    0
18149    0
31150    0
41071    0
6072     0
38296    0
38139    0
25635    0
63170    0
13524    0
72921    0
20735    0
31779    0
49779    0
41276    0
66804    0
55663    0
56349    0
45130    1
27529    0
55287    0
27888    0
43233    0
50358    0
48475    0
32066    0
51282    0
17933    0
50608    0
11708    0
21317    0
53907    0
74418    1
49406    0
7772     0
44817    0
21595    0
6357     0
76606    0
5373     0
8479     0
56104    0
47270    0
12535    0
72251    0
60983    0
8243     0
38239    0
77036    0
4243     0
58746    0
640      0
18591    0
28489    0
53525    0
8455     0
63984    0
3493     0
7517     0
15759    0
14607    0

In [None]:
PCA






------------------








In [None]:
# Deployment notes: the lines below are shell commands, not Python. They are
# kept as comments so this cell no longer raises SyntaxError when the
# notebook is run top to bottom; run them in a terminal instead.
# ssh -i ~/.ssh/first_key.pem ubuntu@ec2-54-91-234-129.compute-1.amazonaws.com
# [ec2-user ~]$ git clone https://github.com/DMSaunders/capstone
# #[ec2-user ~]$ wget https://s3.amazonaws.com/galv-dsi-2018-ds/psam_p06.csv