In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/temperature/tavg_data.csv
/kaggle/input/contagious-diseases/mumps.csv
/kaggle/input/contagious-diseases/measles.csv
/kaggle/input/contagious-diseases/pertussis.csv
/kaggle/input/contagious-diseases/hepatitis.csv
/kaggle/input/contagious-diseases/polio.csv
/kaggle/input/contagious-diseases/smallpox.csv
/kaggle/input/contagious-diseases/rubella.csv
/kaggle/input/precipitate/pcp.csv


In [2]:
import warnings
warnings.filterwarnings("ignore") 
import pandas as pd
import calendar
import numpy as np

import random
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from deap import creator, base, tools, algorithms
import sys

In [3]:
def week2month(hep):
    hep['LastDayWeek'] = pd.to_datetime((hep['week']-1).astype(str) + "6", format="%Y%U%w")
    hep['MonthMax'] = pd.DatetimeIndex(hep['LastDayWeek']).month
    hep['Year'] = pd.DatetimeIndex(hep['LastDayWeek']).year
    hep['MonthName'] = [calendar.month_name[i] for i in hep.MonthMax]
    return hep

In [4]:
def get_input_transform(file_):
    return week2month(pd.read_csv(file_))

In [5]:
# load the dataset from Kaggle
hep = get_input_transform('../input/contagious-diseases/hepatitis.csv')
mea = get_input_transform('../input/contagious-diseases/measles.csv')
mum = get_input_transform('../input/contagious-diseases/mumps.csv')
per = get_input_transform('../input/contagious-diseases/pertussis.csv')
pol = get_input_transform('../input/contagious-diseases/polio.csv')
rub = get_input_transform('../input/contagious-diseases/rubella.csv')
sma = get_input_transform('../input/contagious-diseases/smallpox.csv')

In [6]:
# for our exploratory purpose we examine data for 1960 through to 2011 for US states
# combining all the disease datasets is shown below

train_data = hep
for i in [mea,mum,per,pol,rub,sma]:
    train_data = train_data.append(i)
train_data = train_data.loc[(train_data['Year'] >= 1960) & (train_data['Year'] <=2011)]

# examine the dataset 
train_data.head()

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita,LastDayWeek,MonthMax,Year,MonthName
0,196601,AL,ALABAMA,HEPATITIS A,5,0.14,1966-01-01,1,1966,January
1,196601,AR,ARKANSAS,HEPATITIS A,11,0.58,1966-01-01,1,1966,January
2,196601,AZ,ARIZONA,HEPATITIS A,6,0.37,1966-01-01,1,1966,January
3,196601,CA,CALIFORNIA,HEPATITIS A,89,0.47,1966-01-01,1,1966,January
4,196601,CO,COLORADO,HEPATITIS A,1,0.05,1966-01-01,1,1966,January


In [7]:
# some data discrepancies must be resolved (issues like \\N )
train_data_bad = train_data[train_data.cases==train_data.cases.astype(str).max()]
print(train_data_bad.head(10))


         week state    state_name disease cases  incidence_per_capita  \
73273  196026    FL       FLORIDA   POLIO    \N                   0.0   
73509  196031    ND  NORTH DAKOTA   POLIO    \N                   0.0   
73531  196032    AR      ARKANSAS   POLIO    \N                   0.0   
73911  196040    IA          IOWA   POLIO    \N                   0.0   
74204  196046    KY      KENTUCKY   POLIO    \N                   0.0   
74266  196047    NM    NEW MEXICO   POLIO    \N                   0.0   
75694  196138    KY      KENTUCKY   POLIO    \N                   0.0   
76051  196145    VA      VIRGINIA   POLIO    \N                   0.0   
76261  196150    IN       INDIANA   POLIO    \N                   0.0   
76531  196211    LA     LOUISIANA   POLIO    \N                   0.0   

      LastDayWeek  MonthMax  Year  MonthName  
73273  1960-06-25         6  1960       June  
73509  1960-07-30         7  1960       July  
73531  1960-08-06         8  1960     August  
73911  1

In [8]:
ind = list(train_data_bad.index)

In [9]:
train_data = train_data.drop(train_data.index[ind])

In [10]:
train_data.head(10)

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita,LastDayWeek,MonthMax,Year,MonthName
0,196601,AL,ALABAMA,HEPATITIS A,5,0.14,1966-01-01,1,1966,January
1,196601,AR,ARKANSAS,HEPATITIS A,11,0.58,1966-01-01,1,1966,January
2,196601,AZ,ARIZONA,HEPATITIS A,6,0.37,1966-01-01,1,1966,January
3,196601,CA,CALIFORNIA,HEPATITIS A,89,0.47,1966-01-01,1,1966,January
4,196601,CO,COLORADO,HEPATITIS A,1,0.05,1966-01-01,1,1966,January
5,196601,CT,CONNECTICUT,HEPATITIS A,9,0.31,1966-01-01,1,1966,January
6,196601,DE,DELAWARE,HEPATITIS A,3,0.58,1966-01-01,1,1966,January
7,196601,FL,FLORIDA,HEPATITIS A,7,0.11,1966-01-01,1,1966,January
8,196601,GA,GEORGIA,HEPATITIS A,4,0.09,1966-01-01,1,1966,January
9,196601,HI,HAWAII,HEPATITIS A,3,0.42,1966-01-01,1,1966,January


In [11]:
# will be using visulisation.csv for our visualization purposes later

train_data.to_csv('for_visulisation.csv')

In [12]:
# examining the characteristics of the dataset
train_data.describe()
train_data.shape

(361935, 10)

In [13]:
# converting non-numerical data to lowercase (to keep consistencies over all future datasets included)

train_data.state_name = [i.lower() for i in train_data.state_name]
train_data.shape

(361935, 10)

In [14]:
join_tavg = pd.read_csv('../input/temperature/tavg_data.csv')
join_tavg.state_name = [i.lower() for i in join_tavg.state_name]

In [15]:
join_tavg.head(10)

Unnamed: 0,week,t_avg,tavg_anomaly,state_name
0,196001,44.5,-0.6,alabama
1,196002,44.4,-3.6,alabama
2,196003,45.3,-9.8,alabama
3,196004,64.1,1.3,alabama
4,196005,67.8,-2.9,alabama
5,196006,77.3,-0.2,alabama
6,196007,81.1,1.2,alabama
7,196008,79.1,-0.2,alabama
8,196009,74.6,-0.1,alabama
9,196010,65.1,1.2,alabama


In [16]:
result = pd.merge(train_data, join_tavg, on=['week', 'state_name'])
train_data = result

In [17]:
result.head(10)

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita,LastDayWeek,MonthMax,Year,MonthName,t_avg,tavg_anomaly
0,196601,AL,alabama,HEPATITIS A,5,0.14,1966-01-01,1,1966,January,40.0,-5.1
1,196601,AL,alabama,RUBELLA,7,0.2,1966-01-01,1,1966,January,40.0,-5.1
2,196601,AR,arkansas,HEPATITIS A,11,0.58,1966-01-01,1,1966,January,34.7,-4.6
3,196601,AR,arkansas,MEASLES,2,0.11,1966-01-01,1,1966,January,34.7,-4.6
4,196601,AZ,arizona,HEPATITIS A,6,0.37,1966-01-01,1,1966,January,37.8,-2.9
5,196601,AZ,arizona,MEASLES,98,6.07,1966-01-01,1,1966,January,37.8,-2.9
6,196601,AZ,arizona,RUBELLA,29,1.8,1966-01-01,1,1966,January,37.8,-2.9
7,196601,CA,california,HEPATITIS A,89,0.47,1966-01-01,1,1966,January,40.8,-1.5
8,196601,CA,california,MEASLES,102,0.54,1966-01-01,1,1966,January,40.8,-1.5
9,196601,CA,california,RUBELLA,7,0.04,1966-01-01,1,1966,January,40.8,-1.5


In [18]:
train_data.head(10)

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita,LastDayWeek,MonthMax,Year,MonthName,t_avg,tavg_anomaly
0,196601,AL,alabama,HEPATITIS A,5,0.14,1966-01-01,1,1966,January,40.0,-5.1
1,196601,AL,alabama,RUBELLA,7,0.2,1966-01-01,1,1966,January,40.0,-5.1
2,196601,AR,arkansas,HEPATITIS A,11,0.58,1966-01-01,1,1966,January,34.7,-4.6
3,196601,AR,arkansas,MEASLES,2,0.11,1966-01-01,1,1966,January,34.7,-4.6
4,196601,AZ,arizona,HEPATITIS A,6,0.37,1966-01-01,1,1966,January,37.8,-2.9
5,196601,AZ,arizona,MEASLES,98,6.07,1966-01-01,1,1966,January,37.8,-2.9
6,196601,AZ,arizona,RUBELLA,29,1.8,1966-01-01,1,1966,January,37.8,-2.9
7,196601,CA,california,HEPATITIS A,89,0.47,1966-01-01,1,1966,January,40.8,-1.5
8,196601,CA,california,MEASLES,102,0.54,1966-01-01,1,1966,January,40.8,-1.5
9,196601,CA,california,RUBELLA,7,0.04,1966-01-01,1,1966,January,40.8,-1.5


In [19]:
join_pcp = pd.read_csv("../input/precipitate/pcp.csv")
join_pcp.state_name = [i.lower() for i in join_pcp.state_name]

In [20]:
result = pd.merge(train_data, join_pcp, on=['week', 'state_name'])
train_data = result

In [21]:
train_data.head(10)

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita,LastDayWeek,MonthMax,Year,MonthName,t_avg,tavg_anomaly,precipitate,pcp_anomaly
0,196601,AL,alabama,HEPATITIS A,5,0.14,1966-01-01,1,1966,January,40.0,-5.1,5.96,0.78
1,196601,AL,alabama,RUBELLA,7,0.2,1966-01-01,1,1966,January,40.0,-5.1,5.96,0.78
2,196601,AR,arkansas,HEPATITIS A,11,0.58,1966-01-01,1,1966,January,34.7,-4.6,4.3,0.26
3,196601,AR,arkansas,MEASLES,2,0.11,1966-01-01,1,1966,January,34.7,-4.6,4.3,0.26
4,196601,AZ,arizona,HEPATITIS A,6,0.37,1966-01-01,1,1966,January,37.8,-2.9,0.76,-0.38
5,196601,AZ,arizona,MEASLES,98,6.07,1966-01-01,1,1966,January,37.8,-2.9,0.76,-0.38
6,196601,AZ,arizona,RUBELLA,29,1.8,1966-01-01,1,1966,January,37.8,-2.9,0.76,-0.38
7,196601,CA,california,HEPATITIS A,89,0.47,1966-01-01,1,1966,January,40.8,-1.5,3.05,-1.2
8,196601,CA,california,MEASLES,102,0.54,1966-01-01,1,1966,January,40.8,-1.5,3.05,-1.2
9,196601,CA,california,RUBELLA,7,0.04,1966-01-01,1,1966,January,40.8,-1.5,3.05,-1.2


In [22]:
from sklearn.preprocessing import MinMaxScaler as mm

scaler = mm()

# avg temp deviations
scaler.fit(train_data['tavg_anomaly'].values.reshape(-1,1))
train_data['tavg_anomaly'] = scaler.transform(train_data['tavg_anomaly'].values.reshape(-1,1))

# avg temp
scaler.fit(train_data['t_avg'].values.reshape(-1,1))
train_data['t_avg'] = scaler.transform(train_data['t_avg'].values.reshape(-1,1))

# precipitation deviations
pcp_anomaly = scaler.fit_transform(train_data['pcp_anomaly'].values.reshape(-1,1))
train_data['pcp_anomaly'] = pcp_anomaly

# precipitation
scaler.fit(train_data['precipitate'].values.reshape(-1,1))
train_data['precipitate'] = scaler.transform(train_data['precipitate'].values.reshape(-1,1))

In [23]:
train_data.head()

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita,LastDayWeek,MonthMax,Year,MonthName,t_avg,tavg_anomaly,precipitate,pcp_anomaly
0,196601,AL,alabama,HEPATITIS A,5,0.14,1966-01-01,1,1966,January,0.482105,0.374613,0.374372,0.347554
1,196601,AL,alabama,RUBELLA,7,0.2,1966-01-01,1,1966,January,0.482105,0.374613,0.374372,0.347554
2,196601,AR,arkansas,HEPATITIS A,11,0.58,1966-01-01,1,1966,January,0.426316,0.390093,0.270101,0.317975
3,196601,AR,arkansas,MEASLES,2,0.11,1966-01-01,1,1966,January,0.426316,0.390093,0.270101,0.317975
4,196601,AZ,arizona,HEPATITIS A,6,0.37,1966-01-01,1,1966,January,0.458947,0.442724,0.047739,0.28157


In [24]:
train_data.to_csv('result.csv',index=False)

In [25]:
train_data.head(10)

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita,LastDayWeek,MonthMax,Year,MonthName,t_avg,tavg_anomaly,precipitate,pcp_anomaly
0,196601,AL,alabama,HEPATITIS A,5,0.14,1966-01-01,1,1966,January,0.482105,0.374613,0.374372,0.347554
1,196601,AL,alabama,RUBELLA,7,0.2,1966-01-01,1,1966,January,0.482105,0.374613,0.374372,0.347554
2,196601,AR,arkansas,HEPATITIS A,11,0.58,1966-01-01,1,1966,January,0.426316,0.390093,0.270101,0.317975
3,196601,AR,arkansas,MEASLES,2,0.11,1966-01-01,1,1966,January,0.426316,0.390093,0.270101,0.317975
4,196601,AZ,arizona,HEPATITIS A,6,0.37,1966-01-01,1,1966,January,0.458947,0.442724,0.047739,0.28157
5,196601,AZ,arizona,MEASLES,98,6.07,1966-01-01,1,1966,January,0.458947,0.442724,0.047739,0.28157
6,196601,AZ,arizona,RUBELLA,29,1.8,1966-01-01,1,1966,January,0.458947,0.442724,0.047739,0.28157
7,196601,CA,california,HEPATITIS A,89,0.47,1966-01-01,1,1966,January,0.490526,0.486068,0.191583,0.234926
8,196601,CA,california,MEASLES,102,0.54,1966-01-01,1,1966,January,0.490526,0.486068,0.191583,0.234926
9,196601,CA,california,RUBELLA,7,0.04,1966-01-01,1,1966,January,0.490526,0.486068,0.191583,0.234926


In [26]:
train_data.duplicated(subset=None, keep='first').sum()

0

In [27]:
# removing the useless attributes from the dataset 

# storing unique state_name and diseases for label encoding 
states = np.asarray(train_data.state_name.unique())
#dis = np.unique(train_data['disease'].values)

week       = train_data.pop('week')
LOW        = train_data.pop('LastDayWeek')
monthN     = train_data.pop('MonthName')
state     = train_data.pop('state')

In [28]:
y = train_data.pop('disease')
X = train_data
print(X)

          state_name cases  incidence_per_capita  MonthMax  Year     t_avg  \
0            alabama     5                  0.14         1  1966  0.482105   
1            alabama     7                  0.20         1  1966  0.482105   
2           arkansas    11                  0.58         1  1966  0.426316   
3           arkansas     2                  0.11         1  1966  0.426316   
4            arizona     6                  0.37         1  1966  0.458947   
...              ...   ...                   ...       ...   ...       ...   
59689         oregon     0                  0.00         3  1993  0.593684   
59690         oregon     0                  0.00         3  1993  0.414737   
59691  massachusetts     0                  0.00         1  1994  0.636842   
59692          texas     1                  0.00         1  1998  0.640000   
59693    mississippi     1                  0.04         1  1999  0.622105   

       tavg_anomaly  precipitate  pcp_anomaly  
0          0.37

In [29]:
print(y)

0        HEPATITIS A
1            RUBELLA
2        HEPATITIS A
3            MEASLES
4        HEPATITIS A
            ...     
59689        RUBELLA
59690        RUBELLA
59691        RUBELLA
59692        RUBELLA
59693        RUBELLA
Name: disease, Length: 59694, dtype: object


In [30]:
# check dimensions

print(X.shape)
print(y.shape)

(59694, 9)
(59694,)


In [31]:
train_data.describe()

Unnamed: 0,incidence_per_capita,MonthMax,Year,t_avg,tavg_anomaly,precipitate,pcp_anomaly
count,59694.0,59694.0,59694.0,59694.0,59694.0,59694.0,59694.0
mean,0.732349,2.060174,1982.123647,0.627913,0.53781,0.198397,0.308903
std,2.810295,0.792493,13.024704,0.179055,0.093729,0.124613,0.08294
min,0.0,1.0,1960.0,0.0,0.0,0.0,0.0
25%,0.01,1.0,1972.0,0.495789,0.48065,0.103643,0.258817
50%,0.08,2.0,1980.0,0.647368,0.5387,0.184045,0.299772
75%,0.38,3.0,1991.0,0.774737,0.597523,0.270101,0.348692
max,143.95,3.0,2011.0,1.0,1.0,1.0,1.0


In [32]:
train_data.head(10)

Unnamed: 0,state_name,cases,incidence_per_capita,MonthMax,Year,t_avg,tavg_anomaly,precipitate,pcp_anomaly
0,alabama,5,0.14,1,1966,0.482105,0.374613,0.374372,0.347554
1,alabama,7,0.2,1,1966,0.482105,0.374613,0.374372,0.347554
2,arkansas,11,0.58,1,1966,0.426316,0.390093,0.270101,0.317975
3,arkansas,2,0.11,1,1966,0.426316,0.390093,0.270101,0.317975
4,arizona,6,0.37,1,1966,0.458947,0.442724,0.047739,0.28157
5,arizona,98,6.07,1,1966,0.458947,0.442724,0.047739,0.28157
6,arizona,29,1.8,1,1966,0.458947,0.442724,0.047739,0.28157
7,california,89,0.47,1,1966,0.490526,0.486068,0.191583,0.234926
8,california,102,0.54,1,1966,0.490526,0.486068,0.191583,0.234926
9,california,7,0.04,1,1966,0.490526,0.486068,0.191583,0.234926


In [33]:
# label encoding of useful non-numerical attributes

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(y)

y = le.transform(y)

le2 = LabelEncoder()
le2.fit(states)

X['state_name'] = le2.transform(X.state_name)

Genetic Algorithm for feature selection

In [34]:
def avg(l):
    """
    Returns the average between list elements
    """
   
    average = (sum(l)/float(len(l)))
    
    
    return average

In [35]:
def getFitness(individual, X, y):
    """
    Feature subset fitness function
    """

    if(individual.count(0) != len(individual)):
        # Getting index with value 0
        cols = [index for index in range(
            len(individual)) if individual[index] == 0]

        # Getting features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)

        # Applying classification algorithm
        clf = DecisionTreeClassifier(max_depth=10)
       
        # Applying K-fold cross validation
        return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
    else:
        return(0,)

In [36]:
def geneticAlgorithm(X, y, n_population, n_generation):
    """
    Deap global variables
    Initialize variables to use eaSimple
    """
    # create individual
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    # create toolbox
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat,
                     creator.Individual, toolbox.attr_bool, len(X.columns))
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", getFitness, X=X, y=y)
    toolbox.register("mate", tools.cxOnePoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # initialize parameters
    pop = toolbox.population(n=n_population)
    hof = tools.HallOfFame(n_population * n_generation)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    # genetic algorithm
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                   ngen=n_generation, stats=stats, halloffame=hof,
                                   verbose=True)

    # return hall of fame
    return hof

In [37]:
def bestIndividual(hof, X, y):
    """
    Get the best individual
    """
    
    
    maxAccurcy = 0.0
    maxAccuracyList = []
    for individual in hof:
        ind = individual.fitness.values
        if(ind[0] > maxAccurcy):
            maxAccurcy = ind[0]
            maxAccuracyList.append(maxAccurcy)
            _individual = individual

    _individualHeader = [list(X)[i] for i in range(
        len(_individual)) if _individual[i] == 1]
    return _individual.fitness.values, _individual, _individualHeader,maxAccuracyList

In [38]:
if __name__ == '__main__':

    n_pop = 10
    n_gen = 15


    # get accuracy with all features
    individual = [1 for i in range(len(X.columns))]
    print("Accuracy with all features: \t" +
          str(getFitness(individual, X, y)) + "\n")

    # apply genetic algorithm
    hof = geneticAlgorithm(X, y, n_pop, n_gen)

    # select the best individual
    accuracy, individual, header , testAccuracyList= bestIndividual(hof, X, y)
    print('Best Accuracy: \t' + str(accuracy))
    print('Number of Features in Subset: \t' + str(individual.count(1)))
    print('Individual: \t\t' + str(individual))
    print('Feature Subset\t: ' + str(header))

    print('\n\ncreating a new classifier with the result')

    # read dataframe from csv one more time
    #df = pd.read_csv(dataframePath, sep=',')

    # with feature subset
    X = X[header]

    clf = DecisionTreeClassifier(max_depth=10)
    #clf = SVC()

    scores = cross_val_score(clf, X, y, cv=5)
    print("Accuracy with Feature Subset: \t" + str(avg(scores)) + "\n")

0.14748800062011772
Accuracy with all features: 	(0.14775602836505236,)

0.07908804334020998
0.3081200491845162
0.14629862127511392
0.28237356419396187
0.13115482285411556
0.28470202532163164
0.13564446544183223
0.14380264192983794
0.15147498752208183
0.32917726173554007
gen	nevals	avg     	min     	max     
0  	10    	0.199205	0.079021	0.329077
0.3098456061008513
0.26960841641386096
0.31738392168342305
1  	3     	0.279869	0.151358	0.317367
0.3081870606812643
0.30818705647155864
0.2737795821449042
0.3097785988138088


KeyboardInterrupt: 

In [None]:
pip install chart_studio

In [None]:
# using plotly for the beautiful plots 

import chart_studio
import pandas as pd

# login api for plotly (dont forget to sign up to plotly)
chart_studio.tools.set_credentials_file(username= 'ab-bh', api_key ='KeUFpD51Wy55BOfM9Czx')



In [None]:
import pandas as pd 
import numpy as np
import plotly.offline as py

def get_viz(the_yr_data, yr):
    py.init_notebook_mode(connected=True)


    for col in the_yr_data.columns:
        the_yr_data[col] = the_yr_data[col].astype(str)

    scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
                [0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
    scl = [
            # Let first 10% (0.1) of the values have color rgb(0, 0, 0)
            [0, 'rgb(0, 0, 0)'],
            [0.1, 'rgb(0, 0, 0)'],

            # Let values between 10-20% of the min and max of z
            # have color rgb(20, 20, 20)
            [0.1, 'rgb(20, 20, 20)'],
            [0.2, 'rgb(20, 20, 20)'],

            # Values between 20-30% of the min and max of z
            # have color rgb(40, 40, 40)
            [0.2, 'rgb(40, 40, 40)'],
            [0.3, 'rgb(40, 40, 40)'],

            [0.3, 'rgb(60, 60, 60)'],
            [0.4, 'rgb(60, 60, 60)'],

            [0.4, 'rgb(80, 80, 80)'],
            [0.5, 'rgb(80, 80, 80)'],

            [0.5, 'rgb(100, 100, 100)'],
            [0.6, 'rgb(100, 100, 100)'],

            [0.6, 'rgb(120, 120, 120)'],
            [0.7, 'rgb(120, 120, 120)'],

            [0.7, 'rgb(140, 140, 140)'],
            [0.8, 'rgb(140, 140, 140)'],

            [0.8, 'rgb(160, 160, 160)'],
            [0.9, 'rgb(160, 160, 160)'],

            [0.9, 'rgb(180, 180, 180)'],
            [1.0, 'rgb(180, 180, 180)']
        ]
    data = [ dict(
            type='choropleth',
            colorscale = scl,
            autocolorscale = True,
            locations = the_yr_data['state'],
            z = the_yr_data['cases'].astype(float),
            zmin=0,
            zmax=500,
            locationmode = 'USA-states',
            text = the_yr_data['text'],
            marker = dict(
                line = dict (
                    color = 'rgb(255,255,255)',
                    width = 2
                )
            ),
            colorbar = dict(
                title = "Disease outbreak - cases in %d" %(yr)
            )
        ) ]

    layout = dict(
        title = '%d US Diseases Cases Found by State<br>(Hover for breakdown)' %(yr),
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)',
            ),
    )

    fig = dict( data=data, layout=layout )

    url = py.iplot( fig, validate=False)

def get_1yr_viz(yr):
    data = pd.read_csv('for_visulisation.csv')
    the_yr_data = data.loc[data['Year'] ==yr]
    from collections import defaultdict
    har = defaultdict(set)
    a = list(the_yr_data.state_name)
    b = list(the_yr_data.disease)
    #print len(a),len(b)
    for i in range(len(a)):
        har[a[i]].add(b[i])
    
    the_yr_data['disease_all'] = [' '.join(list(har[i])) for i in the_yr_data.state_name]
    
    the_yr_data['text'] = the_yr_data['state_name'] + '<br>' +\
    'Disease '+the_yr_data['disease_all']
    
    tf = the_yr_data.filter(['state_name','state','cases'], axis=1)
    tf.cases = tf.cases.astype(int)
    the_yr_data_2 = tf.groupby(['state_name','state']).sum().reset_index()
    the_yr_data_2['disease_all'] = [' '.join(list(har[i])) for i in the_yr_data_2.state_name]
    the_yr_data_2['text'] = the_yr_data_2['state_name'] + '<br>' +\
    'Disease '+the_yr_data_2['disease_all']
    
    get_viz(the_yr_data_2, yr)

In [None]:
get_1yr_viz(2011)

In [None]:
get_1yr_viz(1970)