In [25]:
import numpy as np
import pandas as pd

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
import plotly.figure_factory as ff

# Initialise plotly's offline notebook mode so that the iplot() calls below can
# render figures inline.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather
# than embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

In [26]:
# Directory holding the three salary CSV files. It is the only value that has
# to change to run the notebook on another machine.
DATA_DIR = "d:/UCSD_ECE143/project"

# Load the three salary tables (by degree, by college type and by region,
# going by the file names).
Degree_df = pd.read_csv(DATA_DIR + "/degrees-that-pay-back.csv")
College_df = pd.read_csv(DATA_DIR + "/salaries-by-college-type.csv")
Region_df = pd.read_csv(DATA_DIR + "/salaries-by-region.csv")

In [27]:
def string_dollar_to_integer(data, column):
    '''
    Convert a column of dollar-formatted strings to integers, in place.

    example : $75,000.00 to 75000 (the part after the decimal point is
    dropped, not rounded).

    Values that are not strings (for instance missing entries, or numbers
    left behind by an earlier run of this function) are passed through
    unchanged, so re-running the cell on an already converted column is
    harmless.

    data: pandas.core.frame.DataFrame, modified in place
    column: valid column name in data

    Returns None.
    Raises AssertionError if column is not a str or is not a column of data.
    '''
    assert isinstance(column, str)
    # `Index.contains` was removed in pandas 1.0; the `in` operator performs
    # the same membership check on every pandas version.
    assert column in data.columns

    def _to_int(value):
        # Leave anything that is not text (NaN, already-converted numbers) alone.
        if not isinstance(value, str):
            return value
        # "$75,000.00" -> "$75000.00" -> "75000.00" -> "75000" -> 75000
        return int(value.replace(",", "").strip('$').split('.')[0])

    data[column] = data[column].apply(_to_int)

In [28]:
# Convert every salary column of the three tables from dollar-formatted text
# to integers, one table at a time.
salary_columns = (
    'Starting Median Salary',
    'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary',
    'Mid-Career 25th Percentile Salary',
    'Mid-Career 75th Percentile Salary',
    'Mid-Career 90th Percentile Salary',
)
for df in (Degree_df, College_df, Region_df):
    for salary_column in salary_columns:
        string_dollar_to_integer(df, salary_column)


In [61]:
def visulize_region_difference(Value, title_str):
    '''
    Draw a US choropleth map in which each state is coloured by a salary figure.

    Value: sequence of numbers, one per state, in the fixed state order
        defined below (California first, then the Western, Southern,
        Midwestern and Northeastern states)
    title_str: label of the colour bar; it is also appended to the figure title

    Displays the figure with iplot and returns None.
    Raises AssertionError if Value does not contain exactly one entry per state.
    '''
    # Region name -> state codes, in the order callers must follow when they
    # build Value. Deriving both the locations and the hover text from this
    # single table keeps the two lists aligned by construction.
    regions = [
        ('California', ['CA']),
        ('Western', ['WA', 'OR', 'MT', 'ID', 'WY', 'UT', 'CO', 'NV', 'AK']),
        ('Southern', ['LA', 'AR', 'TN', 'GA', 'SC', 'FL', 'MS', 'KY', 'NC', 'WV',
                      'AZ', 'NM', 'TX', 'OK']),
        ('Midwestern', ['IL', 'SD', 'NE', 'KS', 'MO', 'IA', 'MN', 'WI', 'MI', 'OH', 'ND']),
        ('Northeastern', ['PA', 'NY', 'VT', 'ME', 'DE', 'NH', 'MA', 'DC']),
    ]
    state = [code for _, codes in regions for code in codes]
    Text = [region for region, codes in regions for _ in codes]

    # A shorter or longer Value would pair salaries with the wrong states, so
    # reject it before anything is drawn.
    assert len(Value) == len(state), \
        'Value must contain one entry per state (%d expected, got %d)' % (len(state), len(Value))

    # Other colour scales that can be used instead of 'Portland':
    # 'Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Bluered', 'RdBu', 'Reds', 'Blues',
    # 'Picnic', 'Rainbow', 'Jet', 'Hot', 'Blackbody', 'Earth', 'Electric',
    # 'Viridis', 'Cividis'
    data = dict(type='choropleth',
                locations=state,
                locationmode='USA-states',
                colorscale='Portland',
                text=Text,  # region name attached to each state
                marker=dict(line=dict(color='rgb(255,255,255)', width=2)),  # white state borders
                z=Value,
                colorbar={'title': title_str})

    layout = dict(title='Region wise divide to show ' + title_str,
                  geo=dict(scope='usa', showlakes=False, lakecolor='rgb(85,173,240)'))

    choromap2 = go.Figure(data=[data], layout=layout)
    iplot(choromap2)

In [62]:
# One starting-salary figure per state, in the state order used inside
# visulize_region_difference: California, then 9 Western, 14 Southern,
# 11 Midwestern and 8 Northeastern entries.
Value = (
    [51032]
    + [44414] * 9
    + [44522] * 14
    + [44225] * 11
    + [48496] * 8
)
visulize_region_difference(Value, 'Starting median salary')

In [63]:
# One mid-career salary figure per state, in the state order used inside
# visulize_region_difference: California, then 9 Western, 14 Southern,
# 11 Midwestern and 8 Northeastern entries.
Value = (
    [93132]
    + [78200] * 9
    + [79505] * 14
    + [78180] * 11
    + [91352] * 8
)
visulize_region_difference(Value, 'Mid-career median salary')

In [33]:
# Public colleges are the rows whose school type is 'State'.
Public_College_df = College_df[College_df['School Type'] == 'State']

# Private colleges are all remaining rows, minus any school that also has a
# 'State' row. isin() has to receive the column of public school *names*:
# given the whole DataFrame it compares against the frame's column labels,
# so the exclusion never matched anything.
Private_College_df = College_df[
    (College_df['School Type'] != 'State')
    & (~College_df['School Name'].isin(Public_College_df['School Name']))
]

In [34]:
import scipy
from scipy.optimize import curve_fit

In [42]:
def visulize_schooltype_difference(Public_College_df, Private_College_df, stage_str):
    '''
    Plot the salary distributions of public and private colleges on one figure.

    Public_College_df: DataFrame of public colleges containing column stage_str
    Private_College_df: DataFrame of private colleges containing column stage_str
    stage_str: name of the salary column to compare; also used in the title

    Displays the figure with iplot and returns None.
    Raises KeyError if stage_str is not a column of either frame.
    '''
    group_labels = ['Public College', 'Private College']
    colors = ['#A6ACEC', '#63F5EF']

    # Drop missing salaries first so that columns with gaps can be plotted
    # too; columns without gaps are unaffected.
    hist_data = [
        frame[stage_str].dropna()
        for frame in (Public_College_df, Private_College_df)
    ]

    # bin_size is expressed in the units of the column (2000 per histogram bin);
    # the rug plot underneath the histogram is switched off.
    fig = ff.create_distplot(hist_data, group_labels, colors=colors, bin_size=2000, show_rug=False)

    # Put the legend inside the plotting area, towards the upper right.
    fig['layout'].update(title=stage_str + ': Public vs Private', legend=dict(x=0.65, y=0.8))

    iplot(fig, filename='Hist and Curve')

In [43]:
# Compare public and private colleges on starting salary.
visulize_schooltype_difference(
    Public_College_df,
    Private_College_df,
    stage_str="Starting Median Salary",
)

In [44]:
# Compare public and private colleges on mid-career salary.
visulize_schooltype_difference(
    Public_College_df,
    Private_College_df,
    stage_str="Mid-Career Median Salary",
)