# [Project - Anomaly Detection](https://classroom.google.com/u/1/c/NTI2MzQ4Nzc3NTk3/a/NTI2MzQ4Nzc3NjUx/details)

In [1]:
#################################################
#################### Imports ####################
#################################################

# ---------------- #
# Common Libraries #
# ---------------- #
      
# Standard Imports
import os
import requests
import numpy as np
import pandas as pd

# Working with Dates & Times
from sklearn.model_selection import TimeSeriesSplit
from datetime import timedelta, datetime

# Working with Math & Stats
import statsmodels.api as sm
import scipy.stats as stats

# to evaluate performance using rmse
from sklearn.metrics import mean_squared_error
from math import sqrt 

# holt's linear trend model. 
from statsmodels.tsa.api import Holt

# Plots, Graphs, & Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from matplotlib.dates import DateFormatter

# plotting defaults
plt.rc('figure', figsize=(13, 7))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names no longer resolve on newer releases, so use whichever name the
# installed version actually ships.
_whitegrid_style = (
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
plt.style.use(_whitegrid_style)
plt.rc('font', size=16)

# -------------- #
# Action Imports #
# -------------- #

# Warnings
# Installs a blanket "ignore" filter: every warning raised after this point,
# whatever its category, is suppressed for the rest of the session.
import warnings 
warnings.filterwarnings("ignore")

# ------------ #
# JUPYTER ONLY #
# ------------ #
    
# Disable autosave
%autosave 0

# Increases Display Resolution for Graphs 
%matplotlib inline 
%config InlineBackend.figure_format = 'retina'

# Left Align Tables in Jupyter Notebook
from IPython.display import HTML, display
table_css = 'table {align:left;display:block}'
# The rule only takes effect when it is rendered inside a <style> element, and
# an expression in the middle of a cell is never shown on its own, so the
# HTML object has to be passed to display() explicitly.
display(HTML('<style>{}</style>'.format(table_css)))

# ------------- #
# Local Imports #
# ------------- #

# importing sys
import sys

# adding 00_helper_files to the system path
# NOTE(review): this is an absolute, user-specific path placed at the front of
# sys.path. Presumably it is where the env / QMCBT_* / wrangle modules imported
# below live — confirm, and expect those imports to fail on a machine that
# does not have this directory.
sys.path.insert(0, '/Users/qmcbt/codeup-data-science/00_helper_files')

# env containing sensitive access credentials
import env
from env import user, password, host

# Import Helper Modules
import QMCBT_00_quicktips as qt
import QMCBT_01_acquire as acq
import QMCBT_02_prepare as prep
import QMCBT_03_explore as exp
import QMCBT_04_visualize as viz
import QMCBT_05_model as mod
import QMCBT_wrangle as qw
import wrangle as w

Autosave disabled


# SYLLABUS
* To get 100 on this project you only need to answer 5 out of the 7 questions (along with the other deliverables listed below i.e. slide, your notebook, etc).
* Send your email before the due date and time to datascience@codeup.com (only one team member can do this on behalf of the whole team).
* Submit a link to a final notebook on GitHub that asks and answers questions - document the work you do to justify findings
* Compose an email with the answers to the questions/your findings, and in the email, include the link to your notebook in GitHub and attach your slide.
* You will not present this, so be sure that the details you need your leader to convey/understand are clearly communicated in the email.
* Your slide should be like an executive summary and be in form to present.
* Continue to use best practices of acquire.py, prepare.py, etc.
* Since there is no modeling to be done for this project, there is no need to split the data into train/validate/test
* The cohort schedule is in the SQL database, and alumni.codeup.com has info as well.
* Teamwork with Git handout is posted in the google classroom

I have some questions for you that I need to be answered before the board meeting Wednesday afternoon. I need to be able to speak to the following questions. I also need a single slide that I can incorporate into my existing presentation (Google Slides) that summarizes the most important points. My questions are listed below; however, if you discover anything else important that I didn’t think to ask, please include that as well.

# Download the data
url = 'https://drive.google.com/u/1/uc?id=1phD962Wrt8fetpvX-ersybPcZW3_54ma&export=download'

In [4]:
# Load the anonymized curriculum access data through the local `wrangle`
# module (imported as `w` in the first cell).
df = w.acquire_anonymized_curriculum_access_data()

In [8]:
# Run the wrangle module's cleaning step on the acquired frame.
# NOTE(review): `df` is rebound to the cleaned result, so re-running this cell
# without first re-running the acquire cell feeds already-cleaned data back
# in — confirm clean_the_data tolerates that.
df = w.clean_the_data(df)

In [9]:
# Transposed preview: each column of df is shown as a row.
df.T

date,2018-01-26,2018-01-26.1,2018-01-26.2,2018-01-26.3,2018-01-26.4,2018-01-26.5,2018-01-26.6,2018-01-26.7,2018-01-26.8,2018-01-26.9,...,2021-04-21,2021-04-21.1,2021-04-21.2,2021-04-21.3,2021-04-21.4,2021-04-21.5,2021-04-21.6,2021-04-21.7,2021-04-21.8,2021-04-21.9
date,2018-01-26 00:00:00,2018-01-26 00:00:00,2018-01-26 00:00:00,2018-01-26 00:00:00,2018-01-26 00:00:00,2018-01-26 00:00:00,2018-01-26 00:00:00,2018-01-26 00:00:00,2018-01-26 00:00:00,2018-01-26 00:00:00,...,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00
time,09:55:03,09:56:02,09:56:05,09:56:06,09:56:24,09:56:41,09:56:46,09:56:48,09:56:59,09:58:26,...,16:38:14,16:41:29,16:41:31,16:41:49,16:41:51,16:41:51,16:42:02,16:42:09,16:44:37,16:44:39
page,/,java-ii,java-ii/object-oriented-programming,slides/object_oriented_programming,javascript-i/conditionals,javascript-i/loops,javascript-i/conditionals,javascript-i/functions,javascript-i/loops,javascript-i/functions,...,java-iii/servlets,javascript-i,javascript-ii,jquery,javascript-i/bom-and-dom/dom,jquery/personal-site,jquery/mapbox-api,jquery/ajax/weather-map,anomaly-detection/discrete-probabilistic-methods,jquery/mapbox-api
user_id,1,1,1,1,2,2,3,3,2,4,...,834,64,64,64,875,64,64,64,744,64
cohort_id,8.0,8.0,8.0,8.0,22.0,22.0,22.0,22.0,22.0,22.0,...,134.0,28.0,28.0,28.0,135.0,28.0,28.0,28.0,28.0,28.0
ip,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,97.105.19.61,...,67.11.50.23,71.150.217.33,71.150.217.33,71.150.217.33,24.242.150.231,71.150.217.33,71.150.217.33,71.150.217.33,24.160.137.86,71.150.217.33
name,Hampton,Hampton,Hampton,Hampton,Teddy,Teddy,Teddy,Teddy,Teddy,Teddy,...,Luna,Staff,Staff,Staff,Marco,Staff,Staff,Staff,Staff,Staff
start_date,2015-09-22 00:00:00,2015-09-22 00:00:00,2015-09-22 00:00:00,2015-09-22 00:00:00,2018-01-08 00:00:00,2018-01-08 00:00:00,2018-01-08 00:00:00,2018-01-08 00:00:00,2018-01-08 00:00:00,2018-01-08 00:00:00,...,2020-12-07 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2021-01-25 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00
end_date,2016-02-06 00:00:00,2016-02-06 00:00:00,2016-02-06 00:00:00,2016-02-06 00:00:00,2018-05-17 00:00:00,2018-05-17 00:00:00,2018-05-17 00:00:00,2018-05-17 00:00:00,2018-05-17 00:00:00,2018-05-17 00:00:00,...,2021-06-08 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2021-07-19 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00,2014-02-04 00:00:00
program_id,PHP,PHP,PHP,PHP,Java,Java,Java,Java,Java,Java,...,Java,Java,Java,Java,Java,Java,Java,Java,Java,Java


## 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

## 2. Is there a cohort that referred to a lesson significantly more than other cohorts, which seemed to gloss over it?

## 3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?

## 4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?

## 5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?

## 6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?

# Grad Users

In [11]:
# Create DataFrame for users that have accessed the curriculum past their graduation date
# NOTE(review): in the df.T preview above, the rows named 'Staff' carry
# end_date 2014-02-04, so staff traffic also satisfies this filter — confirm
# whether staff should be excluded before treating these rows as graduates.
grad_users = df[df.date > df.end_date]

In [12]:
# Number of distinct users with activity after their cohort's end date
grad_users['user_id'].nunique()

592

In [13]:
# Distinct user ids among the post-graduation rows
grad_users['user_id'].unique()

array([  1,  11,  21,  26,  37,  40,  50,  51,  53,  60,  64,  66,  67,
        68,  69,  70,  71,  72,  75,  76,  77,  80,  81,  82,  83,  84,
        85,  87,  90,  91,  92,  93,  94,  95,  96,  97,  98, 101, 102,
        16,  42,  41,  17,  22,  15, 104,  47,  57,  10, 105, 106,  28,
       107,  65,  52,  33,  56, 108,  27,  38,  36, 109, 110, 112, 113,
       114, 115, 139, 140, 141, 143, 144, 146, 147, 148, 149, 150, 151,
       152, 153, 156, 161, 162, 163, 164, 165, 167, 168, 169, 170, 172,
       173, 174, 175, 177, 178, 179,  29,  19, 180,   6,  99,  43,  31,
         7, 181, 138,  35, 142,   5, 155, 154,  88, 209, 210,  18,   3,
        20, 118, 211, 212, 214, 216, 217, 131, 135, 128, 127, 124, 130,
       159,  12, 145, 241, 242, 136, 157, 171, 123,  25, 134, 119, 243,
       244, 245, 246, 160, 248, 252, 254, 255, 120, 257, 279, 280, 281,
        14, 253, 188, 186, 283, 184, 215, 278, 183, 204, 193, 195, 197,
       249, 206, 284, 121, 201, 285, 286, 311, 190, 312, 116, 20

In [14]:
# Post-graduation request count per user, most active first
grad_users['user_id'].value_counts()

11     17913
64     16297
53     12329
314     7783
1       7404
       ...  
325        1
593        1
533        1
592        1
163        1
Name: user_id, Length: 592, dtype: int64

In [15]:
# Post-graduation request count per page, most requested first
grad_users['page'].value_counts()

/                                                               15524
javascript-i                                                     4965
spring                                                           4262
search/search_index.json                                         4174
html-css                                                         3678
                                                                ...  
users/1/edit                                                        1
9_Appendix_TSAD_Lesson2                                             1
Dataset_Challenge                                                   1
2.0_Intro_Stats                                                     1
appendix/professional-development/post-interview-review-form        1
Name: page, Length: 1865, dtype: int64

# Data Science

In [16]:
# Restrict the post-graduation rows to users whose program is Data Science
is_data_science = grad_users['program_id'] == 'Data Science'
ds_grad_users = grad_users[is_data_science]

In [17]:
# Transposed preview: each column of ds_grad_users is shown as a row.
ds_grad_users.T

date,2020-01-31,2020-01-31.1,2020-01-31.2,2020-01-31.3,2020-01-31.4,2020-01-31.5,2020-01-31.6,2020-01-31.7,2020-01-31.8,2020-01-31.9,...,2021-04-21,2021-04-21.1,2021-04-21.2,2021-04-21.3,2021-04-21.4,2021-04-21.5,2021-04-21.6,2021-04-21.7,2021-04-21.8,2021-04-21.9
date,2020-01-31 00:00:00,2020-01-31 00:00:00,2020-01-31 00:00:00,2020-01-31 00:00:00,2020-01-31 00:00:00,2020-01-31 00:00:00,2020-01-31 00:00:00,2020-01-31 00:00:00,2020-01-31 00:00:00,2020-01-31 00:00:00,...,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00,2021-04-21 00:00:00
time,11:05:04,11:05:13,11:05:13,11:05:13,14:44:59,14:47:15,14:47:20,14:47:26,14:47:27,14:47:28,...,11:00:06,11:44:36,13:16:43,13:17:05,13:17:07,15:20:12,15:20:12,15:20:12,15:20:14,15:20:18
page,/,1-fundamentals/1.1-intro-to-data-science,1-fundamentals/modern-data-scientist.jpg,1-fundamentals/AI-ML-DL-timeline.jpg,/,search/search_index.json,4-python/7.4.4-advanced-dataframes,3-sql/1-mysql-overview,3-sql/2-mysql-introduction,3-sql/3-databases,...,python/advanced-dataframes,python/control-structures,/,python/control-structures,python/data-types-and-variables,classification/overview,classification/classical_programming_vs_machin...,classification/scale_features_or_not.svg,classification/project,classification/acquire
user_id,476,476,476,476,476,476,476,476,476,476,...,485,692,692,692,692,692,692,692,692,692
cohort_id,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,...,34.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0
ip,136.50.49.145,136.50.49.145,136.50.49.145,136.50.49.145,136.50.49.145,136.50.49.145,136.50.49.145,136.50.49.145,136.50.49.145,136.50.49.145,...,173.174.142.84,96.8.130.134,96.8.130.134,96.8.130.134,96.8.130.134,96.8.130.134,96.8.130.134,96.8.130.134,96.8.130.134,96.8.130.134
name,Bayes,Bayes,Bayes,Bayes,Bayes,Bayes,Bayes,Bayes,Bayes,Bayes,...,Bayes,Darden,Darden,Darden,Darden,Darden,Darden,Darden,Darden,Darden
start_date,2019-08-19 00:00:00,2019-08-19 00:00:00,2019-08-19 00:00:00,2019-08-19 00:00:00,2019-08-19 00:00:00,2019-08-19 00:00:00,2019-08-19 00:00:00,2019-08-19 00:00:00,2019-08-19 00:00:00,2019-08-19 00:00:00,...,2019-08-19 00:00:00,2020-07-13 00:00:00,2020-07-13 00:00:00,2020-07-13 00:00:00,2020-07-13 00:00:00,2020-07-13 00:00:00,2020-07-13 00:00:00,2020-07-13 00:00:00,2020-07-13 00:00:00,2020-07-13 00:00:00
end_date,2020-01-30 00:00:00,2020-01-30 00:00:00,2020-01-30 00:00:00,2020-01-30 00:00:00,2020-01-30 00:00:00,2020-01-30 00:00:00,2020-01-30 00:00:00,2020-01-30 00:00:00,2020-01-30 00:00:00,2020-01-30 00:00:00,...,2020-01-30 00:00:00,2021-01-12 00:00:00,2021-01-12 00:00:00,2021-01-12 00:00:00,2021-01-12 00:00:00,2021-01-12 00:00:00,2021-01-12 00:00:00,2021-01-12 00:00:00,2021-01-12 00:00:00,2021-01-12 00:00:00
program_id,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science,...,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science,Data Science


In [19]:
# Twenty most-requested pages among Data Science post-graduation rows.
# NOTE(review): the result below mixes lessons with non-lesson hits ('/',
# 'search/search_index.json', .jpg/.jpeg/.svg assets), and the same content
# appears under two path spellings ('fundamentals/...' and '1-fundamentals/...'),
# which splits its count — confirm whether these should be filtered/merged.
ds_grad_users['page'].value_counts().head(20)

/                                                                1436
search/search_index.json                                          493
sql/mysql-overview                                                275
classification/overview                                           266
classification/scale_features_or_not.svg                          219
anomaly-detection/AnomalyDetectionCartoon.jpeg                    193
anomaly-detection/overview                                        191
fundamentals/AI-ML-DL-timeline.jpg                                189
fundamentals/modern-data-scientist.jpg                            187
fundamentals/intro-to-data-science                                184
1-fundamentals/modern-data-scientist.jpg                          127
1-fundamentals/AI-ML-DL-timeline.jpg                              126
1-fundamentals/1.1-intro-to-data-science                          126
6-regression/1-overview                                            86
sql/database-design 

## 7. Which lessons are least accessed?

## 8. Anything else I should be aware of?