# Exploratory notebook

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import urllib.parse
import gzip
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import chi2_contingency


from io import BytesIO
from io import StringIO


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from IPython.display import display, display_html
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# ACQUIRE

In [14]:
# Replace this with the actual URL-style file path to your compressed text file
file_path = "file:///Users/miattas/Downloads/anonymized-curriculum-access.txt.gz"

# Convert URL-style file path to standard file path
parsed_path = urllib.parse.urlparse(file_path)
decoded_path = urllib.parse.unquote(parsed_path.path)
standard_path = os.path.abspath(decoded_path)

# Decompress the .gz file and read its content
with gzip.open(standard_path, "rb") as gz_file:
    data = gz_file.read().decode("utf-8")

# Preprocess lines with two consecutive spaces
data = data.replace("  ", " None ")

# Create a DataFrame from the preprocessed data
df = pd.read_csv(StringIO(data), sep=" ", header=None,
                 names=["date", "time", "lesson", "user_id", "cohort_id", "ip_address"])

# Convert 'date' and 'time' columns to a single datetime column
df['date'] = pd.to_datetime(df['date'] + ' ' + df['time'])

# Drop the 'time' column if it's no longer needed
df = df.drop(['time'], axis=1)

# SQL query to join 'logs' and 'cohorts' tables
sql = """
SELECT l.date, l.time, l.path as lesson, l.user_id, c.name,
       l.ip, c.start_date, c.end_date
FROM logs l
JOIN cohorts c ON c.id=l.cohort_id;
"""
# Now you can perform exploratory data analysis on the 'df' DataFrame
# For example, let's display the shape and the first few rows
print(df.shape)
print(df.info)
print(df.head().T)

(900223, 5)
<bound method DataFrame.info of                       date                                            lesson  \
0      2018-01-26 09:55:03                                                 /   
1      2018-01-26 09:56:02                                           java-ii   
2      2018-01-26 09:56:05               java-ii/object-oriented-programming   
3      2018-01-26 09:56:06                slides/object_oriented_programming   
4      2018-01-26 09:56:24                         javascript-i/conditionals   
...                    ...                                               ...   
900218 2021-04-21 16:41:51                              jquery/personal-site   
900219 2021-04-21 16:42:02                                 jquery/mapbox-api   
900220 2021-04-21 16:42:09                           jquery/ajax/weather-map   
900221 2021-04-21 16:44:37  anomaly-detection/discrete-probabilistic-methods   
900222 2021-04-21 16:44:39                                 jquery/mapbox-api

In [8]:
df.columns

Index(['date', 'lesson', 'user_id', 'cohort_id', 'ip_address'], dtype='object')

In [9]:
df['lesson'].value_counts()


/                                                               50313
search/search_index.json                                        19519
javascript-i                                                    18983
toc                                                             18297
java-iii                                                        13733
                                                                ...  
12-distributed-ml/6.1-prepare-part-1                                1
12-distributed-ml/6.4-prepare-part-4                                1
4-python                                                            1
4-python/overview                                                   1
appendix/professional-development/post-interview-review-form        1
Name: lesson, Length: 2314, dtype: int64