# Exploratory notebook

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import urllib.parse
import gzip
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import chi2_contingency
import explore_ms as e

from io import BytesIO
from io import StringIO
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from IPython.display import display, display_html
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# ACQUIRE

In [2]:
logs_df = e.get_sql_data()

In [3]:
logs_df.head().T

Unnamed: 0,0,1,2,3,4
lesson,/,java-ii,java-ii/object-oriented-programming,slides/object_oriented_programming,javascript-i/conditionals
user_id,1,1,1,1,2
name,Hampton,Hampton,Hampton,Hampton,Teddy
program_id,1,1,1,1,2
ip,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61
start_date,2015-09-22,2015-09-22,2015-09-22,2015-09-22,2018-01-08
end_date,2016-02-06,2016-02-06,2016-02-06,2016-02-06,2018-05-17


In [4]:
logs_df.describe()


Unnamed: 0,user_id,program_id
count,847330.0,847330.0
mean,456.707344,2.086004
std,250.734201,0.388231
min,1.0,1.0
25%,263.0,2.0
50%,476.0,2.0
75%,648.0,2.0
max,981.0,4.0


In [5]:
logs_df.columns.tolist()


['lesson', 'user_id', 'name', 'program_id', 'ip', 'start_date', 'end_date']

In [6]:
logs_df.isnull().sum()


lesson        1
user_id       0
name          0
program_id    0
ip            0
start_date    0
end_date      0
dtype: int64

# PREPARE

In [7]:
def prep_logs(logs_df):
    """
    Preprocesses the given DataFrame by dropping unnecessary columns, renaming columns,
    mapping program IDs to program names, ensuring cohort_id is of integer type,
    and extracting lesson and endpoint from path.
    """
    logs_df = logs_df.copy()  # Create a copy to avoid modifying the input DataFrame directly
    
    # column_rename_mapping = {'name': 'cohort', 'created_at': 'created', 'updated_at': 'updated'}
    # logs_df = logs_df.rename(columns=column_rename_mapping)
    
    program_mapping = {1: 'web dev', 2: 'web dev', 3: 'data science', 4: 'frontend'}
    logs_df['program'] = logs_df['program_id'].map(program_mapping)
    

    logs_df['lesson'] = logs_df['path'].str.split('/').str[-2]
    logs_df['endpoint'] = logs_df['path'].str.split('/').str[-1]
    
    return logs_df


In [8]:
logs_df = prep_logs(logs_df)


KeyError: 'path'

# 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [None]:
logs_df['lesson'].value_counts().head()

In [None]:
logs_df.program.value_counts()


In [None]:
data_series = logs_df[(logs_df['program'] == 'data science') & 
                      (logs_df['cohort'] != 'Staff') & 
                      (logs_df['lesson'] != '')].lesson.value_counts().head(7)[2:6]


In [None]:
web_dev_series = logs_df[(logs_df['program'] == 'web dev') & 
                         (logs_df['cohort'] != 'Staff') & 
                         (logs_df['lesson'] != '')].lesson.value_counts().head(5)


In [None]:
front_series = logs_df[(logs_df['program'] == 'frontend') & 
                       (logs_df['cohort'] != 'Staff') & 
                       (logs_df['lesson'] != '')].lesson.value_counts().head(7)[2:6]


In [None]:
# per lesson, per program, per cohort
data_fundies_cohort = logs_df[(logs_df['lesson'] == 'fundamentals') & 
                              (logs_df['program'] == 'data science') & 
                              (logs_df['cohort'] != 'Staff')].cohort.value_counts().head(5)
data_fundies_cohort


In [None]:
# per lesson, per program, per cohort
web_java_cohort = logs_df[(logs_df['lesson'] == 'mysql') & 
                          (logs_df['program'] == 'web dev')  & 
                          (logs_df['cohort'] != 'Staff')].cohort.value_counts().head(10)
web_java_cohort


In [None]:
# per lesson, per program, per cohort
front_images_cohort = logs_df[(logs_df['lesson'] == 'images') & 
                              (logs_df['program'] == 'frontend') & 
                              (logs_df['cohort'] != 'Staff')].cohort.value_counts()
front_images_cohort


# EXPLORE 