# Exploratory notebook

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import urllib.parse
import gzip
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import chi2_contingency
import explore_ms as e

from io import BytesIO
from io import StringIO
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from IPython.display import display, display_html
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# ACQUIRE

In [2]:
logs_df = e.get_sql_data()

In [3]:
logs_df.head().T

Unnamed: 0,0,1,2,3,4
lesson,/,java-ii,java-ii/object-oriented-programming,slides/object_oriented_programming,javascript-i/conditionals
user_id,1,1,1,1,2
cohort,Hampton,Hampton,Hampton,Hampton,Teddy
program_id,1,1,1,1,2
ip,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61
start_date,2015-09-22,2015-09-22,2015-09-22,2015-09-22,2018-01-08
end_date,2016-02-06,2016-02-06,2016-02-06,2016-02-06,2018-05-17


In [4]:
logs_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847330 entries, 0 to 847329
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   lesson      847329 non-null  object
 1   user_id     847330 non-null  int64 
 2   cohort      847330 non-null  object
 3   program_id  847330 non-null  int64 
 4   ip          847330 non-null  object
 5   start_date  847330 non-null  object
 6   end_date    847330 non-null  object
dtypes: int64(2), object(5)
memory usage: 45.3+ MB


In [5]:
logs_df.columns.tolist()


['lesson', 'user_id', 'cohort', 'program_id', 'ip', 'start_date', 'end_date']

In [6]:
logs_df.shape

(847330, 7)

In [7]:
logs_df.describe()


Unnamed: 0,user_id,program_id
count,847330.0,847330.0
mean,456.707344,2.086004
std,250.734201,0.388231
min,1.0,1.0
25%,263.0,2.0
50%,476.0,2.0
75%,648.0,2.0
max,981.0,4.0


In [8]:
logs_df.isnull().sum()


lesson        1
user_id       0
cohort        0
program_id    0
ip            0
start_date    0
end_date      0
dtype: int64

# 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [9]:
logs_df['lesson'].value_counts().head()

/                           45854
javascript-i                18203
toc                         17591
search/search_index.json    17534
java-iii                    13166
Name: lesson, dtype: int64

In [10]:
logs_df['cohort'].value_counts().head()


Staff      84031
Ceres      40730
Zion       38096
Jupiter    37109
Fortuna    36902
Name: cohort, dtype: int64

In [11]:
logs_df['program_id'].value_counts().head()


2    713365
3    103412
1     30548
4         5
Name: program_id, dtype: int64

In [24]:
# Group the DataFrame by program_id, lesson, and cohort
grouped_traffic = logs_df.groupby(['program_id', 'lesson', 'cohort'])

# Count the number of times each lesson was viewed in each cohort
traffic_df = grouped_traffic.size().reset_index(name='count')

# Sort the DataFrame by program_id, lesson, and count
traffic_df = traffic_df.sort_values(by=['program_id', 'lesson', 'count'], ascending=False)


In [31]:
# Print the top 5 rows of the DataFrame
traffic_df.head()

Unnamed: 0,program_id,lesson,cohort,count
13564,4,content/html-css/introduction.html,Apollo,1
13563,4,content/html-css/gitbook/images/favicon.ico,Apollo,1
13562,4,content/html-css,Apollo,2
13561,4,/,Apollo,1
13560,3,working-with-time-series-data,Darden,14


# EXPLORE 

In [None]:

# Group the DataFrame by program_id, lesson, and cohort
grouped_traffic = logs_df.groupby(['program_id', 'lesson', 'cohort'])

# Count the number of times each lesson was viewed in each cohort
traffic_df = grouped_traffic.size().reset_index(name='count')

# Sort the DataFrame by program_id, lesson, and count
traffic_df = traffic_df.sort_values(by=['program_id', 'lesson', 'count'], ascending=False)

# Get unique lessons and programs
unique_lessons = traffic_df['lesson'].unique()
unique_programs = traffic_df['program_id'].unique()

# Create histograms for each lesson's traffic count
for lesson in unique_lessons:
    plt.figure(figsize=(8, 4))
    for program in unique_programs:
        lesson_traffic = traffic_df[(traffic_df['lesson'] == lesson) & (traffic_df['program_id'] == program)]
        plt.hist(lesson_traffic['count'], bins=10, alpha=0.5, label=f'Program {program}')
    
    plt.xlabel('Traffic Count')
    plt.ylabel('Frequency')
    plt.title(f'Histogram of Traffic Count for Lesson: {lesson}')
    plt.legend()
    plt.show()
    
# Print the top 5 rows of the DataFrame
traffic_df.head()