In [1]:
import pandas as pd

In [None]:
import env
import os

def get_connection(db, user=env.user, host=env.host, password=env.pwd):
    '''
    Build the connection URL string for the database named `db`, in the form
    mysql+pymysql://user:password@host/db.
    user, host and password default to the values read from the env module.
    '''
    credentials = f'{user}:{password}'
    location = f'{host}/{db}'
    return f'mysql+pymysql://{credentials}@{location}'
    
def get_log_data():
    '''
    Return the curriculum access logs as a pandas DataFrame indexed by 'date'.

    If the csv file exists, it is read and returned with its 'date' column
    parsed as datetimes and restored as the index.
    If not, pandas reads in a SQL query that acquires log data (joined to the
    cohorts table) from a MySQL database. The date and time columns are merged
    into a single 'date' timestamp that becomes the index, and the result is
    saved to the csv file and returned.

    Both paths return the same layout, so callers can rely on a datetime index.
    '''
    filename = 'logs.csv'

    if os.path.isfile(filename):
        # to_csv below writes the index as a 'date' column; restore it here so
        # the cached frame matches the freshly queried one.
        return pd.read_csv(filename, index_col='date', parse_dates=['date'])

    query = """SELECT l.date, l.time,
                                            l.path as lesson, 
                                            l.user_id, c.name,
                                            l.ip, c.start_date,
                                            c.end_date, c.program_id
                                    FROM logs l
                                    JOIN cohorts c ON c.id=l.cohort_id;"""

    # The query must be passed positionally (pandas names this parameter `sql`).
    df = pd.read_sql(query, get_connection(db='curriculum_logs'))

    # Merge the separate date and time columns into one timestamp.
    # NOTE(review): assumes both columns arrive as strings -- confirm against the driver.
    df['date'] = pd.to_datetime(df['date'] + ' ' + df['time'])

    # 'time' is now redundant; 'date' is kept and becomes the index.
    df = df.drop(['time'], axis=1)
    df = df.set_index('date')

    # Cache the result so later runs skip the database round trip.
    df.to_csv(filename)

    return df

# Load the access logs; the function defined in this cell is get_log_data.
df = get_log_data()

In [36]:
# Distinct program_id values present in the logs.
df.loc[:, 'program_id'].unique()

array([1, 2, 4, 3])

In [41]:
# Distinct values of 'name' on rows with program_id 1.
df.loc[df['program_id'] == 1, 'name'].unique()

array(['Hampton', 'Arches', 'Quincy', 'Kings', 'Lassen', 'Glacier',
       'Denali', 'Joshua', 'Olympic', 'Badlands', 'Ike', 'Franklin',
       'Everglades'], dtype=object)

In [42]:
# Distinct values of 'name' on rows with program_id 2.
df.loc[df['program_id'] == 2, 'name'].unique()

array(['Teddy', 'Sequoia', 'Niagara', 'Pinnacles', 'Mammoth', 'Ulysses',
       'Voyageurs', 'Wrangell', 'Xanadu', 'Yosemite', 'Staff', 'Zion',
       'Andromeda', 'Betelgeuse', 'Ceres', 'Deimos', 'Europa', 'Fortuna',
       'Apex', 'Ganymede', 'Hyperion', 'Bash', 'Jupiter', 'Kalypso',
       'Luna', 'Marco', 'Neptune', 'Oberon'], dtype=object)

In [43]:
# Distinct values of 'name' on rows with program_id 3.
df.loc[df['program_id'] == 3, 'name'].unique()

array(['Bayes', 'Curie', 'Darden', 'Easley', 'Florence'], dtype=object)

In [44]:
# Distinct values of 'name' on rows with program_id 4.
df.loc[df['program_id'] == 4, 'name'].unique()

array(['Apollo'], dtype=object)

In [45]:
# Rows where the requested lesson path is missing.
df.loc[df['lesson'].isnull()]

Unnamed: 0_level_0,lesson,user_id,name,ip,start_date,end_date,program_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-08 09:25:18,,586,Curie,72.177.240.51,2020-02-03,2020-07-07,3


In [64]:
# Convert the two date columns to datetimes so they can be compared with the index.
for date_col in ('start_date', 'end_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Calendar features derived from the timestamp index.
df['access_day'] = df.index.day_name()
df['access_month'] = df.index.month
df['access_year'] = df.index.year

In [65]:
# Display the frame with the added access_* columns (pandas truncates the middle rows).
df

Unnamed: 0_level_0,lesson,user_id,name,ip,start_date,end_date,program_id,access_year,access_day,access_month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-26 09:55:03,/,1,Hampton,97.105.19.61,2015-09-22,2016-02-06,1,2018,Friday,1
2018-01-26 09:56:02,java-ii,1,Hampton,97.105.19.61,2015-09-22,2016-02-06,1,2018,Friday,1
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,Hampton,97.105.19.61,2015-09-22,2016-02-06,1,2018,Friday,1
2018-01-26 09:56:06,slides/object_oriented_programming,1,Hampton,97.105.19.61,2015-09-22,2016-02-06,1,2018,Friday,1
2018-01-26 09:56:24,javascript-i/conditionals,2,Teddy,97.105.19.61,2018-01-08,2018-05-17,2,2018,Friday,1
...,...,...,...,...,...,...,...,...,...,...
2021-04-21 16:41:51,jquery/personal-site,64,Staff,71.150.217.33,2014-02-04,2014-02-04,2,2021,Wednesday,4
2021-04-21 16:42:02,jquery/mapbox-api,64,Staff,71.150.217.33,2014-02-04,2014-02-04,2,2021,Wednesday,4
2021-04-21 16:42:09,jquery/ajax/weather-map,64,Staff,71.150.217.33,2014-02-04,2014-02-04,2,2021,Wednesday,4
2021-04-21 16:44:37,anomaly-detection/discrete-probabilistic-methods,744,Staff,24.160.137.86,2014-02-04,2014-02-04,2,2021,Wednesday,4


In [48]:
# Count of non-null ip entries per user_id, smallest first.
df['ip'].groupby(df['user_id']).count().sort_values()

user_id
652        1
649        1
952        1
165        1
879        1
       ...  
1       7404
314     7783
53     12329
64     16322
11     17913
Name: ip, Length: 911, dtype: int64

In [49]:
# Count of non-null name entries per ip address, smallest first.
df['name'].groupby(df['ip']).count().sort_values()

ip
99.203.154.95           1
107.77.221.14           1
172.58.99.57            1
187.237.231.85          1
187.237.231.72          1
                    ...  
76.185.145.231       4754
71.150.217.33        6791
192.171.117.210      9124
97.105.19.61        60530
97.105.19.58       268648
Name: name, Length: 5200, dtype: int64

In [50]:
# Count of non-null ip entries per value of 'name', smallest first.
df['ip'].groupby(df['name']).count().sort_values()

name
Everglades        1
Denali            4
Apollo            5
Franklin         72
Badlands         93
Ike             253
Joshua          302
Glacier         598
Mammoth         691
Niagara         755
Quincy         1237
Oberon         1672
Hampton        1712
Pinnacles      2158
Kings          2845
Olympic        4954
Neptune        7276
Sequoia        7444
Florence       8562
Arches         8890
Lassen         9587
Easley        14715
Marco         16397
Luna          16623
Bash          17713
Yosemite      20743
Curie         21582
Kalypso       23691
Andromeda     25359
Wrangell      25586
Bayes         26538
Xanadu        27749
Europa        28033
Ulysses       28534
Betelgeuse    29356
Hyperion      29855
Teddy         30926
Darden        32015
Deimos        32888
Apex          33568
Ganymede      33844
Voyageurs     35636
Fortuna       36902
Jupiter       37109
Zion          38096
Ceres         40730
Staff         84031
Name: ip, dtype: int64

1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?


2. Is there a cohort that referred to a lesson significantly more often, while other cohorts seemed to gloss over it?
- The home page (`/`) is the most common path in the grouped counts below.

In [55]:
# Count of non-null ip entries for each (name, lesson) pair, smallest first.
df['ip'].groupby([df['name'], df['lesson']]).count().sort_values()

name       lesson                              
Zion       web-design/ux/purpose                      1
Niagara    java-ii/object-oriented-programming        1
           java-ii/inheritance-and-polymorphism       1
Ceres      register.                                  1
           slides/collections                         1
                                                   ... 
Bayes      /                                       1967
Fortuna    /                                       2038
Voyageurs  /                                       2101
Darden     /                                       2980
Staff      /                                       6340
Name: ip, Length: 13565, dtype: int64

3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?


In [70]:
# Rows timestamped before the row's end_date, excluding the 'Staff' name.
# Note: there is no lower bound on start_date, so earlier rows are kept too.
active_logs = df.loc[df['name'].ne('Staff') & (df.index < df['end_date'])]

In [71]:
# Display the filtered frame of pre-end_date, non-Staff rows.
active_logs

Unnamed: 0_level_0,lesson,user_id,name,ip,start_date,end_date,program_id,access_year,access_day,access_month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-26 09:56:24,javascript-i/conditionals,2,Teddy,97.105.19.61,2018-01-08,2018-05-17,2,2018,Friday,1
2018-01-26 09:56:41,javascript-i/loops,2,Teddy,97.105.19.61,2018-01-08,2018-05-17,2,2018,Friday,1
2018-01-26 09:56:46,javascript-i/conditionals,3,Teddy,97.105.19.61,2018-01-08,2018-05-17,2,2018,Friday,1
2018-01-26 09:56:48,javascript-i/functions,3,Teddy,97.105.19.61,2018-01-08,2018-05-17,2,2018,Friday,1
2018-01-26 09:56:59,javascript-i/loops,2,Teddy,97.105.19.61,2018-01-08,2018-05-17,2,2018,Friday,1
...,...,...,...,...,...,...,...,...,...,...
2021-04-21 16:36:09,jquery/personal-site,869,Marco,136.50.98.51,2021-01-25,2021-07-19,2,2021,Wednesday,4
2021-04-21 16:36:34,html-css/css-ii/bootstrap-grid-system,948,Neptune,104.48.214.211,2021-03-15,2021-09-03,2,2021,Wednesday,4
2021-04-21 16:37:48,java-iii,834,Luna,67.11.50.23,2020-12-07,2021-06-08,2,2021,Wednesday,4
2021-04-21 16:38:14,java-iii/servlets,834,Luna,67.11.50.23,2020-12-07,2021-06-08,2,2021,Wednesday,4


In [73]:
# Count of non-null ip entries per user_id, smallest first.
# NOTE(review): this groups the full `df`, not `active_logs`, so it repeats the
# earlier per-user count -- confirm whether `active_logs` was intended here.
df['ip'].groupby(df['user_id']).count().sort_values()

user_id
652        1
649        1
952        1
165        1
879        1
       ...  
1       7404
314     7783
53     12329
64     16322
11     17913
Name: ip, Length: 911, dtype: int64

4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?


In [78]:
# Rows whose timestamp already occurred earlier in the frame (first occurrence not shown).
df.loc[df.index.duplicated(keep='first')]

Unnamed: 0_level_0,lesson,user_id,name,ip,start_date,end_date,program_id,access_year,access_day,access_month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-26 10:41:22,javascript-i/conditionals,19,Teddy,97.105.19.61,2018-01-08,2018-05-17,2,2018,Friday,1
2018-01-26 11:36:38,javascript-i/functions,26,Pinnacles,97.105.19.61,2017-03-27,2017-07-20,2,2018,Friday,1
2018-01-26 12:11:15,javascript-i/loops,30,Teddy,97.105.19.61,2018-01-08,2018-05-17,2,2018,Friday,1
2018-01-26 13:02:29,appendix,17,Sequoia,12.189.101.90,2017-09-27,2018-02-15,2,2018,Friday,1
2018-01-26 13:02:31,appendix/capstone-workbook,17,Sequoia,12.189.101.90,2017-09-27,2018-02-15,2,2018,Friday,1
...,...,...,...,...,...,...,...,...,...,...
2021-04-21 15:53:08,java-i,890,Marco,70.239.184.149,2021-01-25,2021-07-19,2,2021,Wednesday,4
2021-04-21 16:04:04,java-ii,64,Staff,71.150.217.33,2014-02-04,2014-02-04,2,2021,Wednesday,4
2021-04-21 16:04:04,mysql,821,Luna,136.50.16.223,2020-12-07,2021-06-08,2,2021,Wednesday,4
2021-04-21 16:05:32,java-i/console-io,64,Staff,71.150.217.33,2014-02-04,2014-02-04,2,2021,Wednesday,4


5. At some point in 2019, the ability for students and alumni to access both curricula (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?


6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?


In [66]:
# Rows timestamped after the row's end_date, excluding the 'Staff' name.
alumni_logs = df.loc[df['name'].ne('Staff') & (df.index > df['end_date'])]

In [79]:
# How often each lesson path appears in the post-end_date rows.
alumni_logs.loc[:, 'lesson'].value_counts()

/                              9228
javascript-i                   3152
spring                         2870
search/search_index.json       2834
html-css                       2403
                               ... 
10.04.04_TextClassification       1
10.04.05_TopicModeling            1
10.10_Exercises                   1
11.00_Intro.md                    1
7.04.01_Partitioning              1
Name: lesson, Length: 1377, dtype: int64

In [80]:
# Post-end_date counts for paths beginning with 'javascript-i'
# (the prefix also matches 'javascript-ii' paths).
alumni_logs.loc[alumni_logs['lesson'].str.startswith('javascript-i'), 'lesson'].value_counts()


javascript-i                                                                 3152
javascript-ii                                                                1988
javascript-i/functions                                                        785
javascript-i/javascript-with-html                                             775
javascript-i/loops                                                            644
javascript-ii/promises                                                        625
javascript-i/conditionals                                                     613
javascript-ii/es6                                                             611
javascript-i/introduction/working-with-data-types-operators-and-variables     577
javascript-i/bom-and-dom/dom                                                  575
javascript-ii/ajax-api                                                        575
javascript-ii/npm                                                             568
javascript-ii/ma

In [81]:
# Post-end_date counts for paths beginning with 'spring'.
alumni_logs.loc[alumni_logs['lesson'].str.startswith('spring'), 'lesson'].value_counts()


spring                                         2870
spring/fundamentals/controllers                1299
spring/setup                                   1236
spring/fundamentals/views                      1166
spring/fundamentals/repositories               1073
spring/fundamentals/form-model-binding          737
spring/fundamentals/relationships               709
spring/fundamentals/security/authentication     698
spring/fundamentals/services                    631
spring/fundamentals/security                    391
spring/extra-features/json-response             285
spring/extra-features/form-validation           238
spring/extra-features/file-upload               221
spring/extra-features/error-pages               157
spring/extra-features                           129
spring/fundamentals/integration-tests           119
spring/fundamentals                               3
spring/security                                   1
Name: lesson, dtype: int64

In [82]:
# Post-end_date counts for paths beginning with 'html-css'.
alumni_logs.loc[alumni_logs['lesson'].str.startswith('html-css'), 'lesson'].value_counts()


html-css                                       2403
html-css/elements                               937
html-css/introduction                           800
html-css/forms                                  671
html-css/css-i/selectors-and-properties         466
html-css/css-ii/bootstrap-grid-system           443
html-css/css-ii/bootstrap-introduction          440
html-css/css-i/box-model                        430
html-css/css-i/positioning                      399
html-css/css-ii/media-queries                   398
html-css/css-i/introduction                     368
html-css/css-i/flexbox/flexbox-fundamentals     274
html-css/css-i                                  234
html-css/css-i/flexbox/flexbox-in-practice      188
html-css/css-ii/grids                           175
html-css/css-ii                                 164
html-css/css-i/grid/css-grid-fundamentals       146
html-css/css-i/grid/css-grid-intro              106
html-css/css-i/grid/css-grid-in-practice         87
html-css/css

In [None]:
# Post-end_date counts for paths beginning with 'html-css'.
# NOTE(review): same expression as the preceding html-css cell and never executed;
# consider removing it or changing the prefix.
alumni_logs.loc[alumni_logs['lesson'].str.startswith('html-css'), 'lesson'].value_counts()


7. Which lessons are least accessed?


8. Anything else I should be aware of?
