# Anomaly detection project

In [1]:
# Ignoring warning messages from python
import warnings
warnings.filterwarnings('ignore')

# General use imports
import pandas as pd
import numpy as np


# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# Acquisition and preparation imports

# Classification performance measuring imports
from sklearn import metrics

### *I. Checking out cohorts*

In [2]:
cohorts = pd.read_csv('cohorts_data.csv')
cohorts.head()

Unnamed: 0.1,Unnamed: 0,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,0,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
1,1,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2,2,3,Carlsbad,#carlsbad,2014-09-04,2014-11-05,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
3,3,4,Denali,#denali,2014-10-20,2015-01-18,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
4,4,5,Everglades,#everglades,2014-11-18,2015-02-24,2016-06-14 19:52:26,2016-06-14 19:52:26,,1


In [3]:
cohorts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53 non-null     int64  
 1   id          53 non-null     int64  
 2   name        53 non-null     object 
 3   slack       53 non-null     object 
 4   start_date  53 non-null     object 
 5   end_date    53 non-null     object 
 6   created_at  53 non-null     object 
 7   updated_at  53 non-null     object 
 8   deleted_at  0 non-null      float64
 9   program_id  53 non-null     int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 4.3+ KB


In [4]:
cohorts.program_id.nunique()

4

In [5]:
cohorts.id.nunique()

53

In [6]:
cohorts.id.unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  51,  52,  53,  55,  56,
        57,  58,  59,  60,  61,  62, 132, 133, 134, 135, 136, 137, 138,
       139])

### *II. Checking out logs*

In [7]:
logs = pd.read_csv('logs_data.csv')
logs.head()

Unnamed: 0.1,Unnamed: 0,date,time,path,user_id,cohort_id,ip
0,0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [8]:
logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900223 entries, 0 to 900222
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  900223 non-null  int64  
 1   date        900223 non-null  object 
 2   time        900223 non-null  object 
 3   path        900222 non-null  object 
 4   user_id     900223 non-null  int64  
 5   cohort_id   847330 non-null  float64
 6   ip          900223 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 48.1+ MB


In [9]:
logs.cohort_id.nunique()

47

In [10]:
logs.cohort_id.unique()

array([  8.,  22.,  21.,   1.,  16.,  18.,  19.,  nan,  13.,  14.,  15.,
         7.,   4.,  12.,  17.,  23.,   2.,   9.,  11.,  24.,  25.,  26.,
         6.,  27.,  28.,  29.,  31.,  32.,  33.,  34.,  51.,  52.,  53.,
        55.,  56.,  57.,   5.,  58.,  59.,  61.,  62., 132., 134., 133.,
       135., 138., 137., 139.])

In [11]:
logs.user_id.nunique()

981

In [12]:
logs.path.nunique()

2313

### *III. Creating a unique dataframe*

In [13]:
cohorts.head()

Unnamed: 0.1,Unnamed: 0,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,0,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
1,1,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2,2,3,Carlsbad,#carlsbad,2014-09-04,2014-11-05,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
3,3,4,Denali,#denali,2014-10-20,2015-01-18,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
4,4,5,Everglades,#everglades,2014-11-18,2015-02-24,2016-06-14 19:52:26,2016-06-14 19:52:26,,1


In [14]:
cohorts = cohorts.drop(columns = ['Unnamed: 0', 'slack', 'created_at', 'updated_at', 'deleted_at', 'program_id'])
cohorts.head()

Unnamed: 0,id,name,start_date,end_date
0,1,Arches,2014-02-04,2014-04-22
1,2,Badlands,2014-06-04,2014-08-22
2,3,Carlsbad,2014-09-04,2014-11-05
3,4,Denali,2014-10-20,2015-01-18
4,5,Everglades,2014-11-18,2015-02-24


In [15]:
logs.head()

Unnamed: 0.1,Unnamed: 0,date,time,path,user_id,cohort_id,ip
0,0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [16]:
combined_df = pd.merge(logs, cohorts, how = 'outer', left_on = 'cohort_id', right_on= 'id')
combined_df.head()

Unnamed: 0.1,Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,start_date,end_date
0,0.0,2018-01-26,09:55:03,/,1.0,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06
1,1.0,2018-01-26,09:56:02,java-ii,1.0,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06
2,2.0,2018-01-26,09:56:05,java-ii/object-oriented-programming,1.0,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06
3,3.0,2018-01-26,09:56:06,slides/object_oriented_programming,1.0,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06
4,58.0,2018-01-26,10:40:15,javascript-i/functions,1.0,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06


In [None]:
def make_datetime_index(df):
    df['datetime'] = pd.to_datetime(df.date + ' ' + df.time)
    df = df.set_index('datetime').sort_index()
    df = df.drop(columns= ['Unnamed: 0', 'date', 'time'])
    return df

In [None]:
combined_df = make_datetime_index(logs)
combined_df.head()

## 2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?