# Exploratory notebook

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import urllib.parse
import gzip
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import chi2_contingency
import explore_ms as e
import env

from io import BytesIO
from io import StringIO
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from IPython.display import display, display_html
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# ACQUIRE

In [2]:
logs_df = e.get_sql_data()

In [3]:
logs_df.head().T

date,2018-01-26 09:55:03,2018-01-26 09:56:02,2018-01-26 09:56:05,2018-01-26 09:56:06,2018-01-26 09:56:24
lesson,/,java-ii,java-ii/object-oriented-programming,slides/object_oriented_programming,javascript-i/conditionals
user_id,1,1,1,1,2
cohort,Hampton,Hampton,Hampton,Hampton,Teddy
program_id,1,1,1,1,2
ip,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61
start_date,2015-09-22,2015-09-22,2015-09-22,2015-09-22,2018-01-08
end_date,2016-02-06,2016-02-06,2016-02-06,2016-02-06,2018-05-17
program,web dev,web dev,web dev,web dev,web dev


In [4]:
logs_df.tail().T

date,2021-04-21 16:41:51,2021-04-21 16:42:02,2021-04-21 16:42:09,2021-04-21 16:44:37,2021-04-21 16:44:39
lesson,jquery/personal-site,jquery/mapbox-api,jquery/ajax/weather-map,anomaly-detection/discrete-probabilistic-methods,jquery/mapbox-api
user_id,64,64,64,744,64
cohort,Staff,Staff,Staff,Staff,Staff
program_id,2,2,2,2,2
ip,71.150.217.33,71.150.217.33,71.150.217.33,24.160.137.86,71.150.217.33
start_date,2014-02-04,2014-02-04,2014-02-04,2014-02-04,2014-02-04
end_date,2014-02-04,2014-02-04,2014-02-04,2014-02-04,2014-02-04
program,web dev,web dev,web dev,web dev,web dev


In [5]:
logs_df.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 847330 entries, 2018-01-26 09:55:03 to 2021-04-21 16:44:39
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   lesson      847329 non-null  object
 1   user_id     847330 non-null  int64 
 2   cohort      847330 non-null  object
 3   program_id  847330 non-null  int64 
 4   ip          847330 non-null  object
 5   start_date  847330 non-null  object
 6   end_date    847330 non-null  object
 7   program     847330 non-null  object
dtypes: int64(2), object(6)
memory usage: 58.2+ MB


In [6]:
logs_df.columns.tolist()


['lesson',
 'user_id',
 'cohort',
 'program_id',
 'ip',
 'start_date',
 'end_date',
 'program']

In [7]:
logs_df.shape

(847330, 8)

In [8]:
logs_df.describe()


Unnamed: 0,user_id,program_id
count,847330.0,847330.0
mean,456.707344,2.086004
std,250.734201,0.388231
min,1.0,1.0
25%,263.0,2.0
50%,476.0,2.0
75%,648.0,2.0
max,981.0,4.0


In [9]:
logs_df.isnull().sum()


lesson        1
user_id       0
cohort        0
program_id    0
ip            0
start_date    0
end_date      0
program       0
dtype: int64

# 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [10]:
lesson_names = logs_df['lesson'].unique()
lesson_names

array(['/', 'java-ii', 'java-ii/object-oriented-programming', ...,
       'florence-python-assessment.html', 'javascript-i/dom',
       'appendix/professional-development/post-interview-review-form'],
      dtype=object)

In [11]:
logs_df['lesson'].value_counts()

/                                                               45854
javascript-i                                                    18203
toc                                                             17591
search/search_index.json                                        17534
java-iii                                                        13166
                                                                ...  
content/examples/javascript/primitive-types.html                    1
content/examples/javascript/conditionals.html                       1
2-storytelling/1-overview/www.qlik.com                              1
syntax-types-and-variables                                          1
appendix/professional-development/post-interview-review-form        1
Name: lesson, Length: 2224, dtype: int64

In [12]:
logs_df['cohort'].value_counts().head()


Staff      84031
Ceres      40730
Zion       38096
Jupiter    37109
Fortuna    36902
Name: cohort, dtype: int64

In [13]:
logs_df['program'].value_counts().head()


web dev         743913
data science    103412
frontend             5
Name: program, dtype: int64

In [15]:
# Group the DataFrame by program_id, lesson, and name, and count the occurrences
grouped_traffic = logs_df.groupby(['program', 'lesson', 'cohort']) \
                    .size().reset_index(name='count')

# Sort the grouped DataFrame by count in descending order
top_lessons = grouped_traffic.sort_values('count', ascending=False)

# Filter the top lessons for program_id 1, excluding lessons like '/', 'appendix', 'index.html'
top_lessons_program_1 = top_lessons[
    (top_lessons['program'] == 1) &
    (~top_lessons['lesson'].isin(['/', 'appendix', 'index.html']))
].head(10)

In [16]:
logs_df

Unnamed: 0_level_0,lesson,user_id,cohort,program_id,ip,start_date,end_date,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-26 09:55:03,/,1,Hampton,1,97.105.19.61,2015-09-22,2016-02-06,web dev
2018-01-26 09:56:02,java-ii,1,Hampton,1,97.105.19.61,2015-09-22,2016-02-06,web dev
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,Hampton,1,97.105.19.61,2015-09-22,2016-02-06,web dev
2018-01-26 09:56:06,slides/object_oriented_programming,1,Hampton,1,97.105.19.61,2015-09-22,2016-02-06,web dev
2018-01-26 09:56:24,javascript-i/conditionals,2,Teddy,2,97.105.19.61,2018-01-08,2018-05-17,web dev
...,...,...,...,...,...,...,...,...
2021-04-21 16:41:51,jquery/personal-site,64,Staff,2,71.150.217.33,2014-02-04,2014-02-04,web dev
2021-04-21 16:42:02,jquery/mapbox-api,64,Staff,2,71.150.217.33,2014-02-04,2014-02-04,web dev
2021-04-21 16:42:09,jquery/ajax/weather-map,64,Staff,2,71.150.217.33,2014-02-04,2014-02-04,web dev
2021-04-21 16:44:37,anomaly-detection/discrete-probabilistic-methods,744,Staff,2,24.160.137.86,2014-02-04,2014-02-04,web dev


In [None]:
# # Define a function to get top lessons by program_id
# def get_top_lessons_by_program(logs_df, program):
#     grouped_traffic = logs_df.groupby(['program', 'lesson', 'cohort']) \
#                         .size().reset_index(name='count')
#     top_lessons = grouped_traffic.sort_values('count', ascending=False)
    
#     return top_lessons[
#         (top_lessons['program'] == program) &
#         (~top_lessons['lesson'].isin(['/', 'appendix', 'index.html']))].head(10)

In [18]:
# Call the function to get top lessons for program
top_lessons_program_wd = e.get_top_lessons_by_program(logs_df, program='web dev')
"Top Lessons for Program wd:"
top_lessons_program_wd

Unnamed: 0,program,lesson,cohort,count
13342,web dev,toc,Jupiter,1866
9532,web dev,javascript-i,Staff,1817
12167,web dev,search/search_index.json,Apex,1497
13363,web dev,toc,Zion,1465
12754,web dev,spring,Staff,1403
9155,web dev,java-iii,Staff,1393
12171,web dev,search/search_index.json,Ceres,1380
12194,web dev,search/search_index.json,Staff,1349
13335,web dev,toc,Fortuna,1293
7845,web dev,html-css,Staff,1284


In [19]:
# Call the function to get top lessons for program
top_lessons_program_ds = e.get_top_lessons_by_program(logs_df, program='data science')
"Top Lessons for Program ds:"
top_lessons_program_ds

Unnamed: 0,program,lesson,cohort,count
932,data science,classification/overview,Darden,1109
954,data science,classification/scale_features_or_not.svg,Darden,943
1549,data science,sql/mysql-overview,Darden,774
1468,data science,search/search_index.json,Darden,664
59,data science,1-fundamentals/modern-data-scientist.jpg,Bayes,650
51,data science,1-fundamentals/AI-ML-DL-timeline.jpg,Bayes,648
9,data science,1-fundamentals/1.1-intro-to-data-science,Bayes,640
1171,data science,fundamentals/modern-data-scientist.jpg,Florence,627
1078,data science,fundamentals/AI-ML-DL-timeline.jpg,Florence,624
1160,data science,fundamentals/intro-to-data-science,Florence,615


In [20]:
# Call the function to get top lessons for program
top_lessons_program_fe = e.get_top_lessons_by_program(logs_df, program='frontend')
"Top Lessons for Program fe:"
top_lessons_program_fe

Unnamed: 0,program,lesson,cohort,count
1764,frontend,content/html-css,Apollo,2
1765,frontend,content/html-css/gitbook/images/favicon.ico,Apollo,1
1766,frontend,content/html-css/introduction.html,Apollo,1


# EXPLORE 

In [21]:
def create_histograms(traffic_df, program_id):
    # Define a custom color palette
    color_palette = ['#FF4500', '#FF6347', '#D2691E', '#F4A460', '#8B4513', '#BDB76B', '#2F4F4F']
    

    
    # Get the unique lessons for the program
    unique_lessons = program_id_traffic['lesson'].unique()
    
    # Create subplots for each lesson's histogram
    num_cols = 3  # Number of columns for subplots
    num_rows = (len(unique_lessons) + num_cols - 1) // num_cols  # Number of rows for subplots
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 10*num_rows))
    
    # Iterate through unique lessons and plot histograms
    for idx, lesson in enumerate(unique_lessons):
        row = idx // num_cols
        col = idx % num_cols
        ax = axes[row, col]
        
        lesson_traffic = program_id_traffic[program_id_traffic['lesson'] == lesson]
        ax.hist(lesson_traffic['count'], bins=20, edgecolor='black', color=color_palette[idx % len(color_palette)])
        ax.set_title(f'Traffic Histogram for Lesson: {lesson} ({program_name})')
        ax.set_xlabel('Traffic Count')
        ax.set_ylabel('Frequency')
    
    plt.tight_layout()

# # Get unique program IDs
# unique_program_ids = traffic_df['program_id'].unique()

# Loop through program IDs and plot histograms for each program
for program_id in unique_program_ids:
    create_histograms(traffic_df, program_id)
    plt.show()  # Show each set of histograms before moving to the next program



NameError: name 'unique_program_ids' is not defined