# Explore

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import wrangle_victoire_
import env

import warnings
warnings.filterwarnings("ignore")

sns.set_theme(style="whitegrid")

**Get data**

In [2]:
# load cleaned data
codeup = wrangle_victoire_.wrangle_codeup()
codeup.head()

Unnamed: 0,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,date,time,path,user_id,cohort_id,ip
0,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


### 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [3]:
# drop nulls in the path column
codeup = codeup[~codeup.path.isna()]
# remove bad urls from path
codeup = codeup[~codeup.path.isin(["/"])]

In [5]:
# not doing this because the files are located in a main path which I now consider the lesson
# # remove all rows that contains file path ending because they are not lessons
# file_type = "|".join([".jpg", ".jpeg", ".svg", ".json", ".ico", ".html"])
# codeup = codeup[~codeup["path"].str.contains(file_type, case=False)]

In [6]:
# create a lesson column
codeup["lesson"] = codeup.path.str.extract('^(.*?)(?=\/)')

**solution**

In [7]:
#  goup the data frame
lesson_counts = codeup.groupby(["program_id","cohort_id","lesson"]).date.agg(["count"]).sort_values(by=["program_id"], ascending=False)

lesson_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
program_id,cohort_id,lesson,Unnamed: 3_level_1
4.0,9.0,content,4
3.0,59.0,1-fundamentals,2401
3.0,55.0,A-clustering,1
3.0,55.0,9-timeseries,591
3.0,55.0,8-clustering,746
...,...,...,...
1.0,14.0,examples,122
1.0,14.0,content,1128
1.0,14.0,appendix,231
1.0,14.0,6-regression,1


In [8]:
# Reset the index and create columns for group keys
lesson_counts = lesson_counts.reset_index()
lesson_counts

Unnamed: 0,program_id,cohort_id,lesson,count
0,4.0,9.0,content,4
1,3.0,59.0,1-fundamentals,2401
2,3.0,55.0,A-clustering,1
3,3.0,55.0,9-timeseries,591
4,3.0,55.0,8-clustering,746
...,...,...,...,...
967,1.0,14.0,examples,122
968,1.0,14.0,content,1128
969,1.0,14.0,appendix,231
970,1.0,14.0,6-regression,1


In [16]:
# find name of highly visited lesson
lesson_counts = lesson_counts.groupby(["program_id","cohort_id"])["lesson","count"].max()
lesson_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,lesson,count
program_id,cohort_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,1.0,uploads,1039
1.0,2.0,prework,57
1.0,4.0,prework,2
1.0,6.0,spring,11
1.0,7.0,spring,245
1.0,8.0,uploads,155
1.0,11.0,search,79
1.0,12.0,prework,258
1.0,13.0,spring,2254
1.0,14.0,web-design,1128


### 2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?

In [10]:
# which lesson was most visited overall
lesson_counts.lesson.value_counts()

web-design    26
spring         7
timeseries     5
prework        3
uploads        2
search         2
content        1
Name: lesson, dtype: int64

In [11]:
# which cohort visited web-design
web_design = lesson_counts[lesson_counts.lesson == "web-design"]
web_design

Unnamed: 0_level_0,Unnamed: 1_level_0,lesson,count
program_id,cohort_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,14.0,web-design,1128
1.0,17.0,web-design,1359
2.0,18.0,web-design,187
2.0,21.0,web-design,1408
2.0,22.0,web-design,3787
2.0,23.0,web-design,3747
2.0,24.0,web-design,4117
2.0,25.0,web-design,4074
2.0,26.0,web-design,3567
2.0,27.0,web-design,2797


In [12]:
# reset dataframe index
web_design = web_design.reset_index()
web_design.head(3)

Unnamed: 0,program_id,cohort_id,lesson,count
0,1.0,14.0,web-design,1128
1,1.0,17.0,web-design,1359


In [30]:
# get min and max cohort counts
web_design[(web_design["count"] == web_design["count"].max()) | (web_design["count"] == web_design["count"].min())]



Unnamed: 0,program_id,cohort_id,lesson,count
2,2.0,18.0,web-design,187
14,2.0,33.0,web-design,5991
