In [1]:
import pandas as pd
import env

# for presentation purposes
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('anonymized-curriculum-access.txt', sep=" ")

In [3]:
def acquire_logs(user=env.user, password=env.password, host=env.host):
    '''
    This function gathers curriculum_logs data from the 
    SQL codeup database and returns the information in a 
    pandas dataframe
    '''
    url = f'mysql+pymysql://{env.user}:{env.password}@{env.host}/curriculum_logs'
    query = '''
    select * from cohorts;
    '''
    df = pd.read_sql(query, url)
    df.to_csv('cohorts.csv')

In [4]:
# a function that show a summary of the dataset
def data_summary(df):
    # Print the shape of the DataFrame
    print(f'data shape: {df.shape}')
    # set all the columns names to a lowercase
    df.columns = df.columns.str.lower()
    # Create a summary DataFrame
    summary = pd.DataFrame(df.dtypes, columns=['data type'])
    # Calculate the number of missing values
    summary['#missing'] = df.isnull().sum().values 
    # Calculate the percentage of missing values
    summary['%missing'] = df.isnull().sum().values / len(df)* 100
    # Calculate the number of unique values
    summary['#unique'] = df.nunique().values
    # Create a descriptive DataFrame
    desc = pd.DataFrame(df.describe(include='all').transpose())
    # Add the minimum, maximum, and first three values to the summary DataFrame
    summary['count'] = desc['count'].values
    summary['mean'] = desc['mean'].values
    summary['std'] = desc['std'].values
    summary['min'] = desc['min'].values
    summary['25%'] = desc['25%'].values
    summary['50%'] = desc['50%'].values
    summary['75%'] = desc['75%'].values
    summary['max'] = desc['max'].values
    summary['first_value'] = df.loc[0].values
    summary['second_value'] = df.loc[1].values
    summary['third_value'] = df.loc[2].values
    
    # Return the summary DataFrame
    return summary

In [5]:
acquire_logs()

In [6]:
df_2 = pd.read_csv('cohorts.csv')

In [7]:
df_2 = df_2.drop(columns ='Unnamed: 0')

In [8]:
data_summary(df_2)

data shape: (53, 9)


Unnamed: 0,data type,#missing,%missing,#unique,count,mean,std,min,25%,50%,75%,max,first_value,second_value,third_value
id,int64,0,0.0,53,53.0,43.45283,42.982808,1.0,14.0,27.0,57.0,139.0,1,2,3
name,object,0,0.0,53,53.0,,,,,,,,Arches,Badlands,Carlsbad
slack,object,0,0.0,52,53.0,,,,,,,,#arches,#badlands,#carlsbad
start_date,object,0,0.0,49,53.0,,,,,,,,2014-02-04,2014-06-04,2014-09-04
end_date,object,0,0.0,50,53.0,,,,,,,,2014-04-22,2014-08-22,2014-11-05
created_at,object,0,0.0,41,53.0,,,,,,,,2016-06-14 19:52:26,2016-06-14 19:52:26,2016-06-14 19:52:26
updated_at,object,0,0.0,41,53.0,,,,,,,,2016-06-14 19:52:26,2016-06-14 19:52:26,2016-06-14 19:52:26
deleted_at,float64,53,100.0,0,0.0,,,,,,,,,,
program_id,int64,0,0.0,4,53.0,1.924528,0.729833,1.0,1.0,2.0,2.0,4.0,1,1,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900222 entries, 0 to 900221
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   2018-01-26    900222 non-null  object 
 1   09:55:03      900222 non-null  object 
 2   /             900221 non-null  object 
 3   1             900222 non-null  int64  
 4   8             847329 non-null  float64
 5   97.105.19.61  900222 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 41.2+ MB


In [10]:
df.loc[len(df.index)] = ['2018-01-26', '09:55:03', '/', 1, 8 , '97.105.19.61']
df.columns = ['date', 'time', 'path', 'user_id', 'cohort_id', 'ip']
df.shape

(900223, 6)

In [11]:
data_summary(df)

data shape: (900223, 6)


Unnamed: 0,data type,#missing,%missing,#unique,count,mean,std,min,25%,50%,75%,max,first_value,second_value,third_value
date,object,0,0.0,1182,900223.0,,,,,,,,2018-01-26,2018-01-26,2018-01-26
time,object,0,0.0,73167,900223.0,,,,,,,,09:56:02,09:56:05,09:56:06
path,object,1,0.000111,2313,900222.0,,,,,,,,java-ii,java-ii/object-oriented-programming,slides/object_oriented_programming
user_id,int64,0,0.0,981,900223.0,458.825707,249.296767,1.0,269.0,475.0,660.0,981.0,1,1,1
cohort_id,float64,52893,5.875544,47,847330.0,48.501049,32.795482,1.0,28.0,33.0,57.0,139.0,8.0,8.0,8.0
ip,object,0,0.0,5531,900223.0,,,,,,,,97.105.19.61,97.105.19.61,97.105.19.61


In [12]:
df_2

Unnamed: 0,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
1,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2,3,Carlsbad,#carlsbad,2014-09-04,2014-11-05,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
3,4,Denali,#denali,2014-10-20,2015-01-18,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
4,5,Everglades,#everglades,2014-11-18,2015-02-24,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
5,6,Franklin,#franklin,2015-02-03,2015-05-26,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
6,7,Glacier,#glacier,2015-06-05,2015-10-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
7,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
8,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4
9,10,Balboa,#balboa,2015-11-03,2016-03-11,2016-06-14 19:52:26,2016-06-14 19:52:26,,4


In [13]:
df

Unnamed: 0,date,time,path,user_id,cohort_id,ip
0,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
1,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
2,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61
4,2018-01-26,09:56:41,javascript-i/loops,2,22.0,97.105.19.61
...,...,...,...,...,...,...
900218,2021-04-21,16:42:02,jquery/mapbox-api,64,28.0,71.150.217.33
900219,2021-04-21,16:42:09,jquery/ajax/weather-map,64,28.0,71.150.217.33
900220,2021-04-21,16:44:37,anomaly-detection/discrete-probabilistic-methods,744,28.0,24.160.137.86
900221,2021-04-21,16:44:39,jquery/mapbox-api,64,28.0,71.150.217.33


In [14]:
df = df.merge(df_2, left_on='user_id', right_on='id')
df

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
1,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
3,2018-01-26,10:40:15,javascript-i/functions,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
4,2018-01-26,11:26:13,java-i,1,8.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73734,2021-02-26,12:04:25,content/gitbook/images/favicon.ico,139,14.0,174.25.169.61,139,Oberon,#oberon,2021-04-12,2021-10-01,2021-04-12 18:07:21,2021-04-12 18:07:21,,2
73735,2021-02-26,12:04:52,content/html-css/elements.html,139,14.0,174.25.169.61,139,Oberon,#oberon,2021-04-12,2021-10-01,2021-04-12 18:07:21,2021-04-12 18:07:21,,2
73736,2021-02-26,12:05:13,content/html-css/css-i,139,14.0,174.25.169.61,139,Oberon,#oberon,2021-04-12,2021-10-01,2021-04-12 18:07:21,2021-04-12 18:07:21,,2
73737,2021-02-26,12:05:18,content/html-css/css-ii,139,14.0,174.25.169.61,139,Oberon,#oberon,2021-04-12,2021-10-01,2021-04-12 18:07:21,2021-04-12 18:07:21,,2


In [15]:
data_summary(df)

data shape: (73739, 15)


Unnamed: 0,data type,#missing,%missing,#unique,count,mean,std,min,25%,50%,75%,max,first_value,second_value,third_value
date,object,0,0.0,1050,73739.0,,,,,,,,2018-01-26,2018-01-26,2018-01-26
time,object,0,0.0,35270,73739.0,,,,,,,,09:56:02,09:56:05,09:56:06
path,object,0,0.0,1442,73739.0,,,,,,,,java-ii,java-ii/object-oriented-programming,slides/object_oriented_programming
user_id,int64,0,0.0,53,73739.0,30.566213,35.540653,1.0,11.0,12.0,53.0,139.0,1,1,1
cohort_id,float64,1334,1.809083,10,72405.0,21.817374,6.708243,1.0,22.0,22.0,28.0,28.0,8.0,8.0,8.0
ip,object,0,0.0,383,73739.0,,,,,,,,97.105.19.61,97.105.19.61,97.105.19.61
id,int64,0,0.0,53,73739.0,30.566213,35.540653,1.0,11.0,12.0,53.0,139.0,1,1,1
name,object,0,0.0,53,73739.0,,,,,,,,Arches,Arches,Arches
slack,object,0,0.0,52,73739.0,,,,,,,,#arches,#arches,#arches
start_date,object,0,0.0,49,73739.0,,,,,,,,2014-02-04,2014-02-04,2014-02-04


In [16]:
col_to_remove = ['id','slack','deleted_at']

In [17]:
def remove_columns(df, col_to_remove):
    """
    This function will:
    - take in a df and list of columns (you need to create a list of columns that you would like to drop under the name 'cols_to_remove')
    - drop the listed columns
    - return the new df
    """
    df = df.drop(columns=col_to_remove)
    
    return df

In [18]:
def handle_missing_values(df, prop_required_columns=0.5, prop_required_rows=0.75):
    """
    This function will:
    - take in: 
        - a dataframe
        - column threshold (defaulted to 0.5)
        - row threshold (defaulted to 0.75)
    - calculates the minimum number of non-missing values required for each column/row to be retained
    - drops columns/rows with a high proportion of missing values.
    - returns the new df
    """
    
    column_threshold = int(round(prop_required_columns * len(df.index), 0))
    df = df.dropna(axis=1, thresh=column_threshold)
    
    row_threshold = int(round(prop_required_rows * len(df.columns), 0))
    df = df.dropna(axis=0, thresh=row_threshold)
    
    return df

In [19]:
def data_prep(df, col_to_remove, prop_required_columns=0.5, prop_required_rows=0.75):
    """
    This function will:
    - take in: 
        - a dataframe
        - list of columns
        - column threshold (defaulted to 0.5)
        - row threshold (defaulted to 0.75)
    - removes unwanted columns
    - remove rows and columns that contain a high proportion of missing values
    - returns cleaned df
    """
    df = remove_columns(df, col_to_remove)
    df = handle_missing_values(df, prop_required_columns, prop_required_rows)
    return df

In [20]:
df = data_prep(df, col_to_remove, prop_required_columns=0.5, prop_required_rows=0.75)

In [21]:
data_summary(df)

data shape: (73739, 12)


Unnamed: 0,data type,#missing,%missing,#unique,count,mean,std,min,25%,50%,75%,max,first_value,second_value,third_value
date,object,0,0.0,1050,73739.0,,,,,,,,2018-01-26,2018-01-26,2018-01-26
time,object,0,0.0,35270,73739.0,,,,,,,,09:56:02,09:56:05,09:56:06
path,object,0,0.0,1442,73739.0,,,,,,,,java-ii,java-ii/object-oriented-programming,slides/object_oriented_programming
user_id,int64,0,0.0,53,73739.0,30.566213,35.540653,1.0,11.0,12.0,53.0,139.0,1,1,1
cohort_id,float64,1334,1.809083,10,72405.0,21.817374,6.708243,1.0,22.0,22.0,28.0,28.0,8.0,8.0,8.0
ip,object,0,0.0,383,73739.0,,,,,,,,97.105.19.61,97.105.19.61,97.105.19.61
name,object,0,0.0,53,73739.0,,,,,,,,Arches,Arches,Arches
start_date,object,0,0.0,49,73739.0,,,,,,,,2014-02-04,2014-02-04,2014-02-04
end_date,object,0,0.0,50,73739.0,,,,,,,,2014-04-22,2014-04-22,2014-04-22
created_at,object,0,0.0,41,73739.0,,,,,,,,2016-06-14 19:52:26,2016-06-14 19:52:26,2016-06-14 19:52:26


In [22]:
def nulls_by_col(df):
    """
    This function will:
        - take in a dataframe
        - assign a variable to a Series of total row nulls for ea/column
        - assign a variable to find the percent of rows w/nulls
        - output a df of the two variables.
    """
    num_missing = df.isnull().sum()
    pct_miss = (num_missing / df.shape[0]) * 100
    cols_missing = pd.DataFrame({
                    'num_rows_missing': num_missing,
                    'percent_rows_missing': pct_miss
                    })
    
    return cols_missing

In [23]:
nulls_by_col(df)

Unnamed: 0,num_rows_missing,percent_rows_missing
date,0,0.0
time,0,0.0
path,0,0.0
user_id,0,0.0
cohort_id,1334,1.809083
ip,0,0.0
name,0,0.0
start_date,0,0.0
end_date,0,0.0
created_at,0,0.0


In [24]:
def nulls_by_row(df, index_id='customer_id'):
    num_missing = df.isnull().sum(axis=1)
    pct_miss = (num_missing / df.shape[1]) * 100
    row_missing = num_missing.value_counts().sort_index()

    rows_missing = pd.DataFrame({
        'num_cols_missing': num_missing,
        'percent_cols_missing': pct_miss,
        'num_rows': row_missing
    }).reset_index()

    result_df = df.merge(rows_missing, left_index=True, right_on='index').drop('index', axis=1)[['num_cols_missing', 'percent_cols_missing', 'num_rows']]

    return result_df

In [25]:
nulls_by_row(df, index_id='cohort_id')

Unnamed: 0,num_cols_missing,percent_cols_missing,num_rows
0,0,0.0,72405.0
1,0,0.0,1334.0
2,0,0.0,
3,0,0.0,
4,0,0.0,
...,...,...,...
73734,0,0.0,
73735,0,0.0,
73736,0,0.0,
73737,0,0.0,


In [26]:
def get_data(df):
    df = pd.read_csv('anonymized-curriculum-access.txt', sep=" ")
    df_2 = pd.read_csv('cohorts.csv')
    df_2 = df_2.drop(columns ='Unnamed: 0')
    df.loc[len(df.index)] = ['2018-01-26', '09:55:03', '/', 1, 8 , '97.105.19.61']
    df.columns = ['date', 'time', 'path', 'user_id', 'cohort_id', 'ip']

    df = df.merge(df_2, left_on='user_id', right_on='id')
    df = df.drop(columns = 'id')
    df = df.drop(columns = 'deleted_at')
    df = df.drop(columns = 'slack')
    df['date'] = pd.to_datetime( df['date'])
    df['time'] = pd.to_datetime( df['time'])
    df['start_date'] = pd.to_datetime( df['start_date'])
    df['end_date'] = pd.to_datetime( df['end_date'])
    df['created_at'] = pd.to_datetime( df['created_at'])
    df['updated_at'] = pd.to_datetime( df['updated_at'])
    return df

In [27]:
df = get_data(df)

In [28]:
data_summary(df)

data shape: (73739, 12)


Unnamed: 0,data type,#missing,%missing,#unique,count,mean,std,min,25%,50%,75%,max,first_value,second_value,third_value
date,datetime64[ns],0,0.0,1050,73739.0,,,,,,,,2018-01-26 00:00:00,2018-01-26 00:00:00,2018-01-26 00:00:00
time,datetime64[ns],0,0.0,35270,73739.0,,,,,,,,2023-06-14 09:56:02,2023-06-14 09:56:05,2023-06-14 09:56:06
path,object,0,0.0,1442,73739.0,,,,,,,,java-ii,java-ii/object-oriented-programming,slides/object_oriented_programming
user_id,int64,0,0.0,53,73739.0,30.566213,35.540653,1.0,11.0,12.0,53.0,139.0,1,1,1
cohort_id,float64,1334,1.809083,10,72405.0,21.817374,6.708243,1.0,22.0,22.0,28.0,28.0,8.0,8.0,8.0
ip,object,0,0.0,383,73739.0,,,,,,,,97.105.19.61,97.105.19.61,97.105.19.61
name,object,0,0.0,53,73739.0,,,,,,,,Arches,Arches,Arches
start_date,datetime64[ns],0,0.0,49,73739.0,,,,,,,,2014-02-04 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00
end_date,datetime64[ns],0,0.0,50,73739.0,,,,,,,,2014-04-22 00:00:00,2014-04-22 00:00:00,2014-04-22 00:00:00
created_at,datetime64[ns],0,0.0,41,73739.0,,,,,,,,2016-06-14 19:52:26,2016-06-14 19:52:26,2016-06-14 19:52:26


In [29]:
df

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,start_date,end_date,created_at,updated_at,program_id
0,2018-01-26,2023-06-14 09:56:02,java-ii,1,8.0,97.105.19.61,Arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
1,2018-01-26,2023-06-14 09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,Arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
2,2018-01-26,2023-06-14 09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,Arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
3,2018-01-26,2023-06-14 10:40:15,javascript-i/functions,1,8.0,97.105.19.61,Arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
4,2018-01-26,2023-06-14 11:26:13,java-i,1,8.0,97.105.19.61,Arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
...,...,...,...,...,...,...,...,...,...,...,...,...
73734,2021-02-26,2023-06-14 12:04:25,content/gitbook/images/favicon.ico,139,14.0,174.25.169.61,Oberon,2021-04-12,2021-10-01,2021-04-12 18:07:21,2021-04-12 18:07:21,2
73735,2021-02-26,2023-06-14 12:04:52,content/html-css/elements.html,139,14.0,174.25.169.61,Oberon,2021-04-12,2021-10-01,2021-04-12 18:07:21,2021-04-12 18:07:21,2
73736,2021-02-26,2023-06-14 12:05:13,content/html-css/css-i,139,14.0,174.25.169.61,Oberon,2021-04-12,2021-10-01,2021-04-12 18:07:21,2021-04-12 18:07:21,2
73737,2021-02-26,2023-06-14 12:05:18,content/html-css/css-ii,139,14.0,174.25.169.61,Oberon,2021-04-12,2021-10-01,2021-04-12 18:07:21,2021-04-12 18:07:21,2


In [30]:
df.groupby(by=["cohort_id", 'path']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,program_id
cohort_id,path,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,/,6507,367
1.0,appendix,658,46
1.0,appendix/angular,11,1
1.0,appendix/capstone-workbook,110,10
1.0,appendix/documentation,22,2
...,...,...,...
28.0,web-design,193,10
28.0,web-design/intro,182,9
28.0,web-design/ui/color,11,1
28.0,web-design/ui/typography,75,4
