In [1]:
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from kmodes.kmodes import KModes
from sklearn.manifold import TSNE

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# scikit-learn 0.23 removed the bundled `sklearn.externals.joblib`; prefer the
# standalone package and fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Notebook display settings: show every column and every row, and never
# truncate cell text, so the long survey question strings print in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# `None` means "no limit". The former sentinel -1 was deprecated in pandas 1.0
# and is no longer accepted by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the raw Kaggle 2018 multiple-choice survey responses.
# The path is assembled with os.path.join so it resolves on any OS; the
# previous backslash-separated literal only worked on Windows.
df = pd.read_csv(
    os.path.join("Data", "kaggle-survey-DS-and-ML-2018", "multipleChoiceResponses.csv"),
    # parse the file in one pass instead of in chunks
    low_memory=False,
)

In [3]:
# Column inventory of the loaded frame, printed for reference.
cols = df.columns
print(cols)

# Columns whose name contains "OTHER" or "TEXT" are not relevant to the
# analysis: collect them, show them, and drop them in one go.
other_text_cols = [col for col in cols if "OTHER" in col or "TEXT" in col]

print(other_text_cols)
df = df.drop(columns=other_text_cols)
        

Index(['Time from Start to Finish (seconds)', 'Q1', 'Q1_OTHER_TEXT', 'Q2',
       'Q3', 'Q4', 'Q5', 'Q6', 'Q6_OTHER_TEXT', 'Q7',
       ...
       'Q49_OTHER_TEXT', 'Q50_Part_1', 'Q50_Part_2', 'Q50_Part_3',
       'Q50_Part_4', 'Q50_Part_5', 'Q50_Part_6', 'Q50_Part_7', 'Q50_Part_8',
       'Q50_OTHER_TEXT'],
      dtype='object', length=395)
['Q1_OTHER_TEXT', 'Q6_OTHER_TEXT', 'Q7_OTHER_TEXT', 'Q11_OTHER_TEXT', 'Q12_Part_1_TEXT', 'Q12_Part_2_TEXT', 'Q12_Part_3_TEXT', 'Q12_Part_4_TEXT', 'Q12_Part_5_TEXT', 'Q12_OTHER_TEXT', 'Q13_OTHER_TEXT', 'Q14_OTHER_TEXT', 'Q15_OTHER_TEXT', 'Q16_OTHER_TEXT', 'Q17_OTHER_TEXT', 'Q18_OTHER_TEXT', 'Q19_OTHER_TEXT', 'Q20_OTHER_TEXT', 'Q21_OTHER_TEXT', 'Q22_OTHER_TEXT', 'Q27_OTHER_TEXT', 'Q28_OTHER_TEXT', 'Q29_OTHER_TEXT', 'Q30_OTHER_TEXT', 'Q31_OTHER_TEXT', 'Q32_OTHER', 'Q33_OTHER_TEXT', 'Q34_OTHER_TEXT', 'Q35_OTHER_TEXT', 'Q36_OTHER_TEXT', 'Q37_OTHER_TEXT', 'Q38_OTHER_TEXT', 'Q42_OTHER_TEXT', 'Q49_OTHER_TEXT', 'Q50_OTHER_TEXT']


In [4]:
# Structural overview of `df`, one item per line:
#   1. the column labels,
#   2. the (rows, columns) shape,
#   3. missing-value counts per column, most-missing first.
print(
    df.columns,
    df.shape,
    df.isna().sum().sort_values(ascending=False),
    sep="\n",
)

Index(['Time from Start to Finish (seconds)', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5',
       'Q6', 'Q7', 'Q8', 'Q9',
       ...
       'Q49_Part_11', 'Q49_Part_12', 'Q50_Part_1', 'Q50_Part_2', 'Q50_Part_3',
       'Q50_Part_4', 'Q50_Part_5', 'Q50_Part_6', 'Q50_Part_7', 'Q50_Part_8'],
      dtype='object', length=360)
(23860, 360)
Q38_Part_19                            23859
Q38_Part_20                            23859
Q28_Part_22                            23842
Q28_Part_24                            23828
Q29_Part_16                            23821
Q29_Part_25                            23806
Q29_Part_14                            23800
Q30_Part_15                            23798
Q29_Part_24                            23798
Q29_Part_23                            23793
Q30_Part_21                            23792
Q29_Part_17                            23788
Q30_Part_23                            23778
Q28_Part_18                            23775
Q28_Part_33                            23771
Q29

In [5]:
# Working handle for the cleaning steps that follow. Plain assignment binds a
# second name to the same DataFrame object; no data is copied here.
df1 =  df

In [41]:
# Keep the column labels (as a plain list) and the first row of the frame side
# by side; the first row carries the question wording for each column.
all_cols = list(df1.columns)
all_questions = df1.head(1)

In [42]:
# Group the multi-part columns ("Q<i>_Part_<k>") by question number, for
# questions 11 through 50. Each entry is the list of part columns belonging to
# one question, in their original column order.
#
# A column is assigned to question i only when its name STARTS WITH
# "Q<i>_Part". The earlier test ('"Part" in col' and '"Q<i>" in col') matched
# the question tag anywhere in the name, so e.g. "Q110_Part_1" would have been
# filed under question 11.
question_group_with_parts = []

for i in range(11, 51):
    prefix = "Q" + str(i) + "_Part"
    temp_list = [col for col in all_cols if col.startswith(prefix)]
    # questions that have no part columns are left out entirely
    if temp_list:
        question_group_with_parts.append(temp_list)



In [43]:
# Collect one single-cell frame of question wording per question.
# `.loc[:0, :]` slices by label up to and including index label 0, which is the
# row that holds the question text.
qs = []

# Multi-part questions: the first part column stands in for the whole group.
for listx in question_group_with_parts:
    for col in listx[:1]:
        qs.append(df[[col]].loc[:0, :])

# Remaining single-column questions. Build a NEW list instead of aliasing
# `all_cols`: the former `cols_without_parts = all_cols` followed by `.remove()`
# also stripped the part columns out of `all_cols` itself, and made this cell
# raise ValueError when it was run a second time.
part_cols = {col for listx in question_group_with_parts for col in listx}
cols_without_parts = [col for col in all_cols if col not in part_cols]

for col in cols_without_parts:
    qs.append(df[[col]].loc[:0, :])




In [44]:
# Show every collected question frame, each preceded by two blank lines.
for ele in qs:
    print("\n", ele, sep="\n")



                                                                                                                                                                                      Q11_Part_1
0  Select any activities that make up an important part of your role at work: (Select all that apply) - Selected Choice - Analyze and understand data to influence product or business decisions


                                                                                                                                                                            Q13_Part_1
0  Which of the following integrated development environments (IDE's) have you used at work or school in the last 5 years? (Select all that apply) - Selected Choice - Jupyter/IPython


                                                                                                                                                Q14_Part_1
0  Which of the following hosted notebooks have you used at work or school in the last 

In [28]:
# Sanity check: display the container type of `qs` (a plain Python list).
type(qs)

list

In [11]:
# Drop the question row (index label 0): it holds the question wording, not an
# actual survey response.
# Dropping by label with errors="ignore" keeps this cell idempotent. The
# previous positional slice `iloc[1:]` discarded one more real response every
# time the cell was re-run.
df1 = df1.drop(index=0, errors="ignore")
print(df1.shape)

(23859, 360)


In [12]:
# Preview the first five rows of the working frame.
df1.head()

Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11_Part_1,Q11_Part_2,Q11_Part_3,Q11_Part_4,Q11_Part_5,Q11_Part_6,Q11_Part_7,Q12_MULTIPLE_CHOICE,Q13_Part_1,Q13_Part_2,Q13_Part_3,Q13_Part_4,Q13_Part_5,Q13_Part_6,Q13_Part_7,Q13_Part_8,Q13_Part_9,Q13_Part_10,Q13_Part_11,Q13_Part_12,Q13_Part_13,Q13_Part_14,Q13_Part_15,Q14_Part_1,Q14_Part_2,Q14_Part_3,Q14_Part_4,Q14_Part_5,Q14_Part_6,Q14_Part_7,Q14_Part_8,Q14_Part_9,Q14_Part_10,Q14_Part_11,Q15_Part_1,Q15_Part_2,Q15_Part_3,Q15_Part_4,Q15_Part_5,Q15_Part_6,Q15_Part_7,Q16_Part_1,Q16_Part_2,Q16_Part_3,Q16_Part_4,Q16_Part_5,Q16_Part_6,Q16_Part_7,Q16_Part_8,Q16_Part_9,Q16_Part_10,Q16_Part_11,Q16_Part_12,Q16_Part_13,Q16_Part_14,Q16_Part_15,Q16_Part_16,Q16_Part_17,Q16_Part_18,Q17,Q18,Q19_Part_1,Q19_Part_2,Q19_Part_3,Q19_Part_4,Q19_Part_5,Q19_Part_6,Q19_Part_7,Q19_Part_8,Q19_Part_9,Q19_Part_10,Q19_Part_11,Q19_Part_12,Q19_Part_13,Q19_Part_14,Q19_Part_15,Q19_Part_16,Q19_Part_17,Q19_Part_18,Q19_Part_19,Q20,Q21_Part_1,Q21_Part_2,Q21_Part_3,Q21_Part_4,Q21_Part_5,Q21_Part_6,Q21_Part_7,Q21_Part_8,Q21_Part_9,Q21_Part_10,Q21_Part_11,Q21_Part_12,Q21_Part_13,Q22,Q23,Q24,Q25,Q26,Q27_Part_1,Q27_Part_2,Q27_Part_3,Q27_Part_4,Q27_Part_5,Q27_Part_6,Q27_Part_7,Q27_Part_8,Q27_Part_9,Q27_Part_10,Q27_Part_11,Q27_Part_12,Q27_Part_13,Q27_Part_14,Q27_Part_15,Q27_Part_16,Q27_Part_17,Q27_Part_18,Q27_Part_19,Q27_Part_20,Q28_Part_1,Q28_Part_2,Q28_Part_3,Q28_Part_4,Q28_Part_5,Q28_Part_6,Q28_Part_7,Q28_Part_8,Q28_Part_9,Q28_Part_10,Q28_Part_11,Q28_Part_12,Q28_Part_13,Q28_Part_14,Q28_Part_15,Q28_Part_16,Q28_Part_17,Q28_Part_18,Q28_Part_19,Q28_Part_20,Q28_Part_21,Q28_Part_22,Q28_Part_23,Q28_Part_24,Q28_Part_25,Q28_Part_26,Q28_Part_27,Q28_Part_28,Q28_Part_29,Q28_Part_30,Q28_Part_31,Q28_Part_32,Q28_Part_33,Q28_Part_34,Q28_Part_35,Q28_Part_36,Q28_Part_37,Q28_Part_38,Q28_Part_39,Q28_Part_40,Q28_Part_41,Q28_Part_42,Q28_Part_43,Q29_Part_1,Q29_Part_2,Q29_Part_3,Q29_Part_4,Q29_Part_5,Q29_Part_6,Q29_Part_7,Q29_Part_8,Q29_Part_9,Q29_Part_10,
Q29_Part_11,Q29_Part_12,Q29_Part_13,Q29_Part_14,Q29_Part_15,Q29_Part_16,Q29_Part_17,Q29_Part_18,Q29_Part_19,Q29_Part_20,Q29_Part_21,Q29_Part_22,Q29_Part_23,Q29_Part_24,Q29_Part_25,Q29_Part_26,Q29_Part_27,Q29_Part_28,Q30_Part_1,Q30_Part_2,Q30_Part_3,Q30_Part_4,Q30_Part_5,Q30_Part_6,Q30_Part_7,Q30_Part_8,Q30_Part_9,Q30_Part_10,Q30_Part_11,Q30_Part_12,Q30_Part_13,Q30_Part_14,Q30_Part_15,Q30_Part_16,Q30_Part_17,Q30_Part_18,Q30_Part_19,Q30_Part_20,Q30_Part_21,Q30_Part_22,Q30_Part_23,Q30_Part_24,Q30_Part_25,Q31_Part_1,Q31_Part_2,Q31_Part_3,Q31_Part_4,Q31_Part_5,Q31_Part_6,Q31_Part_7,Q31_Part_8,Q31_Part_9,Q31_Part_10,Q31_Part_11,Q31_Part_12,Q32,Q33_Part_1,Q33_Part_2,Q33_Part_3,Q33_Part_4,Q33_Part_5,Q33_Part_6,Q33_Part_7,Q33_Part_8,Q33_Part_9,Q33_Part_10,Q33_Part_11,Q34_Part_1,Q34_Part_2,Q34_Part_3,Q34_Part_4,Q34_Part_5,Q34_Part_6,Q35_Part_1,Q35_Part_2,Q35_Part_3,Q35_Part_4,Q35_Part_5,Q35_Part_6,Q36_Part_1,Q36_Part_2,Q36_Part_3,Q36_Part_4,Q36_Part_5,Q36_Part_6,Q36_Part_7,Q36_Part_8,Q36_Part_9,Q36_Part_10,Q36_Part_11,Q36_Part_12,Q36_Part_13,Q37,Q38_Part_1,Q38_Part_2,Q38_Part_3,Q38_Part_4,Q38_Part_5,Q38_Part_6,Q38_Part_7,Q38_Part_8,Q38_Part_9,Q38_Part_10,Q38_Part_11,Q38_Part_12,Q38_Part_13,Q38_Part_14,Q38_Part_15,Q38_Part_16,Q38_Part_17,Q38_Part_18,Q38_Part_19,Q38_Part_20,Q38_Part_21,Q38_Part_22,Q39_Part_1,Q39_Part_2,Q40,Q41_Part_1,Q41_Part_2,Q41_Part_3,Q42_Part_1,Q42_Part_2,Q42_Part_3,Q42_Part_4,Q42_Part_5,Q43,Q44_Part_1,Q44_Part_2,Q44_Part_3,Q44_Part_4,Q44_Part_5,Q44_Part_6,Q45_Part_1,Q45_Part_2,Q45_Part_3,Q45_Part_4,Q45_Part_5,Q45_Part_6,Q46,Q47_Part_1,Q47_Part_2,Q47_Part_3,Q47_Part_4,Q47_Part_5,Q47_Part_6,Q47_Part_7,Q47_Part_8,Q47_Part_9,Q47_Part_10,Q47_Part_11,Q47_Part_12,Q47_Part_13,Q47_Part_14,Q47_Part_15,Q47_Part_16,Q48,Q49_Part_1,Q49_Part_2,Q49_Part_3,Q49_Part_4,Q49_Part_5,Q49_Part_6,Q49_Part_7,Q49_Part_8,Q49_Part_9,Q49_Part_10,Q49_Part_11,Q49_Part_12,Q50_Part_1,Q50_Part_2,Q50_Part_3,Q50_Part_4,Q50_Part_5,Q50_Part_6,Q50_Part_7,Q50_Part_8
1,710,Female,45-49,United States of America,Doctoral degree,Other,Consultant,Other,,,I do not know,Analyze and understand data to influence product or business decisions,Build and/or run a machine learning service that operationally improves my product or workflows,"Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data",,Do research that advances the state of the art of machine learning,,,"Cloud-based data software & APIs (AWS, GCP, Azure, etc.)",Jupyter/IPython,,,,,,,,,,,,,,,,,,,,,,,,,,,,Microsoft Azure,,,,,,,,,,,,,,,,,,,,,,,,Python,,,,,,,,,,,,,,,,,,,,,,Matplotlib,,,,,,,,,,,,,0% of my time,I have never written code but I want to learn,I have never studied machine learning but plan to learn in the future,Maybe,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Azure Machine Learning Studio,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Microsoft Access,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Twitter,,,,,,,,,,,,,,,,,,,,,,Much better,Much worse,Independent projects are equally important as academic achievements,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,434,Male,30-34,Indonesia,Bachelor’s degree,Engineering (non-computer focused),Other,Manufacturing/Fabrication,5-10,"10-20,000",No (we do not use ML methods),,,,,,None of these activities are an important part of my role at work,,"Basic statistical software (Microsoft Excel, Google Sheets, etc.)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,I have not used any cloud providers,,,,SQL,,,,,,,,,,,,,,,,,Python,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1% to 25% of my time,I have never written code but I want to learn,I have never studied machine learning but plan to learn in the future,Definitely not,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,None/I do not know,,Slightly worse,No opinion; I do not know,Independent projects are equally important as academic achievements,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,718,Female,30-34,United States of America,Master’s degree,"Computer science (software engineering, etc.)",Data Scientist,I am a student,0-1,"0-10,000",I do not know,Analyze and understand data to influence product or business decisions,,,,,,,"Local or hosted development environments (RStudio, JupyterLab, etc.)",,,,,,,MATLAB,,,,,,,,,,,,,,,,,,,,,,,,,I have not used any cloud providers,,,R,,,Java,,,,MATLAB,,,,,,,,,,Java,Python,,,,,,,,,,,,,,,,,,,,,ggplot2,Matplotlib,,,,,,Seaborn,,,,,,ggplot2,75% to 99% of my time,5-10 years,< 1 year,Definitely yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Categorical Data,,,,Numerical Data,,,Text Data,Time Series Data,,,Time Series Data,Government websites,,,"Dataset aggregator/platform (Socrata, Kaggle Public Datasets Platform, etc.)",,,,,GitHub,,,2.0,3.0,20.0,50.0,20.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,,,,DataCamp,,,,,Udemy,,,,,DataCamp,Twitter,,,,,,,,,,,,,,,,,,,,,,Slightly worse,Slightly better,Independent projects are equally important as academic achievements,Very important,Very important,Very important,,Metrics that consider accuracy,,,,0-10,Lack of communication between individuals who collect the data and individuals who analyze the data,,,,,,,,When determining whether it is worth it to put the model into production,,,,10-20,,Examine feature correlations,Examine feature importances,,,,,Plot predicted vs. actual results,,,,,,,,,I am confident that I can explain the outputs of most if not all ML models,,,,,,,Make sure the code is human-readable,Define all random seeds,,Include a text file describing all dependencies,,,,Too time-consuming,,,,,,
4,621,Male,35-39,United States of America,Master’s degree,"Social sciences (anthropology, psychology, sociology, etc.)",Not employed,,,,,,,,,,,,"Local or hosted development environments (RStudio, JupyterLab, etc.)",Jupyter/IPython,RStudio,PyCharm,,,,,Visual Studio,,,Vim,,,,,Kaggle Kernels,Google Colab,,,,,,,,,,Google Cloud Platform (GCP),Amazon Web Services (AWS),,,,,,Python,R,SQL,Bash,,Javascript/Typescript,,,,,,,,,,,,,Python,SQL,Scikit-Learn,TensorFlow,Keras,,Spark MLlib,,,,,Xgboost,,,,,,,,,,Scikit-Learn,ggplot2,Matplotlib,Altair,Shiny,D3,,Bokeh,Seaborn,,,Lattice,,,ggplot2,50% to 74% of my time,5-10 years,4-5 years,Probably yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Categorical Data,,Geospatial Data,,Numerical Data,,Tabular Data,Text Data,Time Series Data,,,Numerical Data,Government websites,,,"Dataset aggregator/platform (Socrata, Kaggle Public Datasets Platform, etc.)","I collect my own data (web-scraping, etc.)",,,,,,,20.0,25.0,15.0,10.0,10.0,20.0,40.0,40.0,10.0,0.0,10.0,0.0,,Coursera,edX,,,,,,,,,,,Coursera,,,,,,,,,,,,,,,,,,,,,None/I do not know,,Much worse,No opinion; I do not know,Independent projects are much more important than academic achievements,Very important,Very important,Very important,Revenue and/or business goals,Metrics that consider accuracy,,,,0-10,,,Difficulty in collecting enough data about groups that may be unfairly targeted,,,,,,,When building a model that was specifically designed to produce such insights,,,20-30,,Examine feature correlations,Examine feature importances,Plot decision boundaries,,,,Plot predicted vs. 
actual results,,Sensitivity analysis/perturbation importance,,,,,,,"Yes, most ML models are ""black boxes""",,,"Share data, code, and environment using a hosted service (Kaggle Kernels, Google Colaboratory, Amazon SageMaker, etc.)",,,,Make sure the code is human-readable,,Define relative rather than absolute file paths,,,,,,Requires too much technical knowledge,,Not enough incentives to share my work,,,
5,731,Male,22-24,India,Master’s degree,Mathematics or statistics,Data Analyst,I am a student,0-1,"0-10,000",I do not know,,,,,,,Other,"Advanced statistical software (SPSS, SAS, etc.)",,RStudio,,,,,,,,,,,,,,,,Azure Notebook,,,,,,,,,,,Microsoft Azure,,,,,,,SQL,,Java,,,,,,,,,,,,,,SQL,R,,,,,,,,,,,,Prophet,,,,,,,,,,,,Shiny,,,,,,,,,,,75% to 99% of my time,5-10 years,2-3 years,Maybe,,,,,Google Kubernetes Engine,,,,,,,,,,,,,,,,,,,,,,,Google Cloud Translation API,,,,,,,,,,,,,,,,,Cloudera,,,,,,,Azure Face API,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,IBM Cloud Compose,,,,,,,,,Google Cloud Dataflow,,,,,,,,,,,,,,,,,,,,,,,Categorical Data,,,,,,,,,,,,,,,,,Publicly released data from private companies,,,,,,10.0,10.0,20.0,10.0,20.0,23.0,20.0,40.0,20.0,20.0,0.0,0.0,Udacity,Coursera,edX,DataCamp,,,,,,,,,,Coursera,,,,Kaggle forums,,,DataTau News Aggregator,,,,,,,,,,,,,,,,Slightly better,Much better,Independent projects are slightly more important than academic achievements,Not at all important,Slightly important,Very important,,,Metrics that consider unfair bias,,,20-30,,,Difficulty in collecting enough data about groups that may be unfairly targeted,Difficulty in identifying and selecting the appropriate evaluation metrics,,,,For all models right before putting the model in production,When determining whether it is worth it to put the model into production,,,,20-30,,,,,Create partial dependence plots,,,,,,,,,,,,I am confident that I can understand and explain the outputs of many but not all ML models,,,,,,,,,Define relative rather than absolute file paths,,,,,Too time-consuming,,,Not enough incentives to share my work,,,


In [13]:
# Frequency table of the answers for every column except the first one
# (the completion-time column). Each table is preceded by two blank lines and
# a header naming the column.
cols = df1.columns
for col in cols[1:]:
    print("\n", " ### Columne Name = " + col, df1[col].value_counts(), sep="\n")



 ### Columne Name = Q1
Male                       19430
Female                     4010 
Prefer not to say          340  
Prefer to self-describe    79   
Name: Q1, dtype: int64


 ### Columne Name = Q2
25-29    6159
22-24    5141
30-34    3776
18-21    3037
35-39    2253
40-44    1360
45-49    858 
50-54    582 
55-59    328 
60-69    273 
70-79    53  
80+      39  
Name: Q2, dtype: int64


 ### Columne Name = Q3
United States of America                                4716
India                                                   4417
China                                                   1644
Other                                                   1036
Russia                                                  879 
Brazil                                                  736 
Germany                                                 734 
United Kingdom of Great Britain and Northern Ireland    702 
France                                                  604 
Canada                        



 ### Columne Name = Q28_Part_7
Amazon Translate    160
Name: Q28_Part_7, dtype: int64


 ### Columne Name = Q28_Part_8
Google Cloud Translation API    530
Name: Q28_Part_8, dtype: int64


 ### Columne Name = Q28_Part_9
Amazon Lex    209
Name: Q28_Part_9, dtype: int64


 ### Columne Name = Q28_Part_10
Google Dialogflow Enterprise Edition    172
Name: Q28_Part_10, dtype: int64


 ### Columne Name = Q28_Part_11
Amazon Rekognition Video    130
Name: Q28_Part_11, dtype: int64


 ### Columne Name = Q28_Part_12
Google Cloud Video Intelligence API    174
Name: Q28_Part_12, dtype: int64


 ### Columne Name = Q28_Part_13
Google Cloud AutoML    432
Name: Q28_Part_13, dtype: int64


 ### Columne Name = Q28_Part_14
Amazon SageMaker    486
Name: Q28_Part_14, dtype: int64


 ### Columne Name = Q28_Part_15
Google Cloud Machine Learning Engine    665
Name: Q28_Part_15, dtype: int64


 ### Columne Name = Q28_Part_16
DataRobot    250
Name: Q28_Part_16, dtype: int64


 ### Columne Name = Q28_Part_17
H20

In [None]:
# Persist the cleaned frame for later analysis.
# NOTE(review): joblib.dump writes a pickle, not CSV text, despite the ".csv"
# extension. The file must be read back with joblib.load, not pd.read_csv.
# The path is assembled with os.path.join so it resolves on any OS; the
# previous backslash-separated literal only worked on Windows.
filepath = os.path.join("Data", "Cleaned", "for-analysis.csv")

# Make sure the target directory exists before writing into it.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
joblib.dump(df1, filepath)
# to reload the saved frame later:
# df1 = joblib.load(filepath)