In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

# Questions:
> * 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?
> * 2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?
> * 3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?
> * 4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?
> * 5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?
> * 6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?
> * 7. Which lessons are least accessed?
> * 8. Anything else I should be aware of?


In [3]:
cohorts = pd.read_csv('cohorts_data.csv')
logs = pd.read_csv('logs_data.csv')

for col in cohorts.columns:
    if 'Unnamed' in col:
        cohorts = cohorts.drop(columns=[col])

for col in logs.columns:
    if 'Unnamed' in col:
        logs = logs.drop(columns=[col])

cohorts.head()

Unnamed: 0,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
1,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
2,3,Carlsbad,#carlsbad,2014-09-04,2014-11-05,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
3,4,Denali,#denali,2014-10-20,2015-01-18,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
4,5,Everglades,#everglades,2014-11-18,2015-02-24,2016-06-14 19:52:26,2016-06-14 19:52:26,,1


In [4]:
cohorts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          53 non-null     int64  
 1   name        53 non-null     object 
 2   slack       53 non-null     object 
 3   start_date  53 non-null     object 
 4   end_date    53 non-null     object 
 5   created_at  53 non-null     object 
 6   updated_at  53 non-null     object 
 7   deleted_at  0 non-null      float64
 8   program_id  53 non-null     int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 3.9+ KB


In [5]:
cohorts= cohorts.drop(columns=['deleted_at'])
cohorts.head(3)

Unnamed: 0,id,name,slack,start_date,end_date,created_at,updated_at,program_id
0,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
1,2,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1
2,3,Carlsbad,#carlsbad,2014-09-04,2014-11-05,2016-06-14 19:52:26,2016-06-14 19:52:26,1


In [6]:
cohorts.tail(4)

Unnamed: 0,id,name,slack,start_date,end_date,created_at,updated_at,program_id
49,136,Placeholder for students in transition,#null,2021-03-03,2029-03-01,2021-03-03 21:50:49,2021-03-03 21:52:56,2
50,137,Florence,#florence,2021-03-15,2021-09-03,2021-03-15 18:18:20,2021-03-15 18:18:20,3
51,138,Neptune,#neptune,2021-03-15,2021-09-03,2021-03-15 19:57:09,2021-03-15 19:57:09,2
52,139,Oberon,#oberon,2021-04-12,2021-10-01,2021-04-12 18:07:21,2021-04-12 18:07:21,2


In [7]:
logs.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [8]:
logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900223 entries, 0 to 900222
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   date       900223 non-null  object 
 1   time       900223 non-null  object 
 2   path       900222 non-null  object 
 3   user_id    900223 non-null  int64  
 4   cohort_id  847330 non-null  float64
 5   ip         900223 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 41.2+ MB


In [9]:
logs.cohort_id.value_counts(dropna=False)

28.0     84031
NaN      52893
33.0     40730
29.0     38096
62.0     37109
53.0     36902
24.0     35636
57.0     33844
56.0     33568
51.0     32888
59.0     32015
22.0     30926
58.0     29855
32.0     29356
23.0     28534
52.0     28033
26.0     27749
34.0     26538
25.0     25586
31.0     25359
132.0    23691
55.0     21582
27.0     20743
61.0     17713
134.0    16623
135.0    16397
133.0    14715
14.0      9587
1.0       8890
137.0     8562
21.0      7444
138.0     7276
17.0      4954
13.0      2845
18.0      2158
8.0       1712
139.0     1672
19.0      1237
16.0       755
15.0       691
7.0        598
12.0       302
11.0       253
2.0         93
6.0         72
9.0          5
4.0          4
5.0          1
Name: cohort_id, dtype: int64

In [10]:
logs = logs.fillna(0)

In [11]:
df = pd.merge(left_on=logs.cohort_id, right_on=cohorts.id, left=logs, right=cohorts,how='outer').drop(columns='key_0')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900229 entries, 0 to 900228
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        900223 non-null  object 
 1   time        900223 non-null  object 
 2   path        900223 non-null  object 
 3   user_id     900223 non-null  float64
 4   cohort_id   900223 non-null  float64
 5   ip          900223 non-null  object 
 6   id          847336 non-null  float64
 7   name        847336 non-null  object 
 8   slack       847336 non-null  object 
 9   start_date  847336 non-null  object 
 10  end_date    847336 non-null  object 
 11  created_at  847336 non-null  object 
 12  updated_at  847336 non-null  object 
 13  program_id  847336 non-null  float64
dtypes: float64(4), object(10)
memory usage: 103.0+ MB


In [13]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,program_id
0,2018-01-26,09:55:03,/,1.0,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0
1,2018-01-26,09:56:02,java-ii,1.0,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1.0,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1.0,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0
4,2018-01-26,10:40:15,javascript-i/functions,1.0,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0


### 7. Which lessons are least accessed?

> * General start and end of entire site
> * General start and end for each module/lesson
> * See if things haven't been accessed in a long time

MVP: 
Lessons


**NOTE:** At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?

In [14]:
#Range of Dates for entire site:
df.date = pd.to_datetime(df.date+" " +df.time)
df[df.date.isnull()].shape[0]

6

> **note:** Only 6 datetime values that are non-functional

In [15]:
# name of the cohorts/placeholders with inaccurate date values
df.name[df.date.isnull()]

900223                                  Carlsbad
900224                                    Balboa
900225                                   Redwood
900226                                       Ada
900227                                        Io
900228    Placeholder for students in transition
Name: name, dtype: object

In [16]:
# confirming the number of occurrences of the above cohorts/ph
df.name.value_counts().tail(10)

Franklin                                  72
Apollo                                     5
Denali                                     4
Redwood                                    1
Io                                         1
Ada                                        1
Everglades                                 1
Balboa                                     1
Carlsbad                                   1
Placeholder for students in transition     1
Name: name, dtype: int64

In [17]:
df.date.min(), df.date.max()

(Timestamp('2018-01-26 09:55:03'), Timestamp('2021-04-21 16:44:39'))

**Conclusion**: Our data started on Jan 26th, 2018 and is current up to April 21, 2021

## Analyzing low traffic by program
> * program 1: web dev
> * program 2: web dev
> * program 3: data science

In [None]:
new = df.path.value_counts().to_frame()
sub = pd.merge(left=df, right = new, left_on=df.path, right_on=new.index)
sub = sub[sub.path_y == 1]

In [141]:
sub.head()

Unnamed: 0,date,time,path_x,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_y
89882,2018-01-27 07:49:25,07:49:25,teams/13,1.0,8.0,72.177.226.58,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,1
126074,2018-01-30 11:41:30,11:41:30,asdf,1.0,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,1
352634,2018-03-01 21:07:05,21:07:05,prework/cli/07-editing-files-with-vim,114.0,8.0,173.173.119.75,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,1
400387,2018-03-11 14:39:16,14:39:16,java-ii/file-op,1.0,8.0,72.177.226.58,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,1
428068,2018-03-21 09:37:43,09:37:43,slides/exceptions,1.0,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,1


In [133]:
# all programs + NaN
sub.program_id.value_counts(dropna=False)

2.0    285
3.0     68
1.0     60
NaN     45
Name: program_id, dtype: int64

### Program 1

In [136]:
sub.date[sub.program_id==1].min(), sub.date[sub.program_id==1].max()

(Timestamp('2018-01-27 07:49:25'), Timestamp('2021-03-05 11:16:56'))

In [126]:
# program 1
sub.date.dt.year[sub.program_id == 1].value_counts()

2018    31
2020    15
2019     8
2021     6
Name: date, dtype: int64

### Program 2

In [137]:
sub.date[sub.program_id==2].min(), sub.date[sub.program_id==2].max()

(Timestamp('2018-01-26 13:39:02'), Timestamp('2021-04-15 16:28:50'))

In [130]:
# program 2
sub.date.dt.year[sub.program_id == 2].value_counts()

2019    134
2020     92
2021     32
2018     27
Name: date, dtype: int64

### Program 3 

In [138]:
sub.date[sub.program_id==3].min(), sub.date[sub.program_id==3].max()

(Timestamp('2019-08-30 12:08:28'), Timestamp('2021-04-11 09:17:02'))

In [128]:
# program 3
sub.date.dt.year[sub.program_id == 3].value_counts()

2020    28
2021    24
2019    16
Name: date, dtype: int64

### NaN

In [139]:
sub.date[sub.program_id.isna()].min(), sub.date[sub.program_id.isna()].max()

(Timestamp('2019-03-04 09:25:59'), Timestamp('2020-10-22 16:10:08'))

In [129]:
sub.date.dt.year[sub.program_id == 1].value_counts().sum() + sub.date.dt.year[sub.program_id == 2].value_counts().sum() + sub.date.dt.year[sub.program_id == 3].value_counts().sum()

413

# Analyzing low traffic by cohort

**Why less than 2021?** Because 2021 would include current/ongoing cohorts and it would make sense for some pages to have low traffic from those cohorts

In [151]:
cohort_sub = sub.name.value_counts(dropna=False).to_frame()
cohort_sub['first_date_access'] = sub.groupby('name').date.min()
cohort_sub['last_date_access'] = sub.groupby('name').date.max()
cohort_sub[(cohort_sub.first_date_access.dt.year < 2021) & (cohort_sub.last_date_access.dt.year >= 2021)].rename(columns={'name':'count'})


Unnamed: 0,count,first_date_access,last_date_access
Staff,111,2019-01-11 11:51:19,2021-04-15 16:28:50
Bayes,28,2019-08-30 12:08:28,2021-03-01 11:16:33
Lassen,15,2018-02-07 13:48:51,2021-03-05 11:16:56
Kings,11,2018-07-09 19:40:52,2021-02-13 05:12:44
Yosemite,11,2018-12-04 10:44:56,2021-03-04 09:33:32
Jupiter,10,2020-10-17 23:11:31,2021-04-06 18:56:06
Bash,9,2020-07-24 16:41:07,2021-01-07 13:54:04
Teddy,9,2018-03-28 15:38:59,2021-03-12 10:00:27
Kalypso,8,2020-11-03 10:12:58,2021-03-11 09:09:49
Luna,6,2020-12-10 10:08:01,2021-02-25 21:32:35


In [155]:
df[df.name=='Staff'].date.min()

Timestamp('2018-01-26 09:55:03')

In [95]:
num_paths = df.path.nunique()
num_paths

2314

In [20]:
bottom_400 = np.unique(df.path.value_counts().iloc[num_paths-400:,].values)
bottom_400

array([1])

In [96]:
accessed_once = df.path.unique()[1:][df.path.value_counts() == 1]
once = pd.Series(accessed_once[:-1])
        
mask = []
added_True = False
for path in df.path.values:
    
    added_True = False
    for uno in once.values:
        if uno == path:
            mask.append(True)
            added_True = True
        if added_True:
            break
    if not added_True:
        mask.append(False)
            
            
df[mask]


Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,program_id,accessed_once
311830,2020-08-05 00:52:40,00:52:40,storytelling/creating-custom-fields,11.0,28.0,76.185.145.231,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,2.0,True
311831,2020-08-05 00:52:41,00:52:41,storytelling/creating-dashboards,11.0,28.0,76.185.145.231,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,2.0,True
311832,2020-08-05 00:52:42,00:52:42,storytelling/creating-stories,11.0,28.0,76.185.145.231,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,2.0,True
311929,2020-08-05 11:06:05,11:06:05,slides/form,428.0,28.0,70.121.183.95,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,2.0,True
311930,2020-08-05 11:06:09,11:06:09,slides/formmodelbinding,428.0,28.0,70.121.183.95,28.0,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
898312,2021-04-20 10:35:18,10:35:18,dataframes,895.0,137.0,96.8.253.119,137.0,Florence,#florence,2021-03-15,2021-09-03,2021-03-15 18:18:20,2021-03-15 18:18:20,3.0,True
898313,2021-04-20 10:35:22,10:35:22,dataframes,895.0,137.0,96.8.253.119,137.0,Florence,#florence,2021-03-15,2021-09-03,2021-03-15 18:18:20,2021-03-15 18:18:20,3.0,True
898314,2021-04-20 10:36:05,10:36:05,dataframes,895.0,137.0,96.8.253.119,137.0,Florence,#florence,2021-03-15,2021-09-03,2021-03-15 18:18:20,2021-03-15 18:18:20,3.0,True
898321,2021-04-20 10:37:37,10:37:37,dataframes,895.0,137.0,96.8.253.119,137.0,Florence,#florence,2021-03-15,2021-09-03,2021-03-15 18:18:20,2021-03-15 18:18:20,3.0,True


In [98]:
df[mask].path.value_counts()

stats/hypothesis-testing-overview     420
fundamentals/spreadsheets-overview    370
classification/knn                    348
classification/random-forests         345
stats/overview                        312
                                     ... 
mysql/java-iii                          1
slides/css                              1
sgithubtudents/1215                     1
examples/html/welcome                   1
where                                   1
Name: path, Length: 457, dtype: int64

In [83]:
once.values

array(['storytelling/creating-custom-fields',
       'storytelling/creating-dashboards',
       'storytelling/creating-stories', 'slides/form',
       'slides/formmodelbinding', 'slides/jpa', 'sw-project-planning',
       'repo-doc', 'storytelling/bad-charts',
       'storytelling/misleading1_baseball.jpg',
       'storytelling/misleading1_fox.jpg',
       'storytelling/misleading3_deaths.jpg',
       'javascript-i/introduction/working-with-data-types-operators-and-variables/600%20Navarro%20St.,%20San%20Antonio,%20TX%2078205,%20United%20States%20of%20America',
       'javascript-i/introduction/working-with-data-types-operators-and-variables/google.com',
       'series', 'dataframes', 'pandas-overview',
       'nlp/regular-expressions',
       'professional-development/t-block-resume',
       'appendix/further-reading/html-css',
       'fundamentals/spreadsheets-overview',
       'fundamentals/visualization-with-excel', 'storytelling/tableau',
       'storytelling/refine', 'storytelling

In [79]:
df.accessed_once.sum()

8204

In [81]:
df[df.accessed_once == True].path.value_counts()

stats/hypothesis-testing-overview     420
fundamentals/spreadsheets-overview    370
classification/knn                    348
classification/random-forests         345
stats/overview                        312
                                     ... 
mysql/java-iii                          1
slides/css                              1
sgithubtudents/1215                     1
examples/html/welcome                   1
where                                   1
Name: path, Length: 457, dtype: int64

# CONCLUSION:

> * **450** of the **2300** **(~20%)** pages have been accessed only once

In [142]:
word_counts = {}


for path in df.path.value_counts().index:
    word = str(path)
    for elem in word.split('/'):
        if len(elem) > 0 and elem not in list(word_counts.keys()):
            word_counts[elem] = 1
        elif len(elem) > 0:
            word_counts[elem] += 1
            
word_count = pd.Series(word_counts, name='word_count').sort_values(ascending=False)
word_count

content                        337
appendix                       222
html-css                        77
mysql                           77
examples                        77
                              ... 
13.3_Refine                      1
bootstrap-introduction.html      1
styling-webpages                 1
9.20_Data                        1
2-sql                            1
Name: word_count, Length: 1614, dtype: int64

In [128]:
df.path.head()

0                                      /
1                                java-ii
2    java-ii/object-oriented-programming
3     slides/object_oriented_programming
4                 javascript-i/functions
Name: path, dtype: object

In [69]:
first_word = [str(path).split('/')[0] for path in df.path.value_counts().index]
second_word = [str(path).split('/')[1] for path in df.path.value_counts().index if len(str(path).split('/'))>1]
third_word = [str(path).split('/')[2] for path in df.path.value_counts().index if len(str(path).split('/'))>2]
fourth_word = [str(path).split('/')[3] for path in df.path.value_counts().index if len(str(path).split('/'))>3]
fifth_word = [str(path).split('/')[4] for path in df.path.value_counts().index if len(str(path).split('/'))>4]
sixth_word = [str(path).split('/')[5] for path in df.path.value_counts().index if len(str(path).split('/'))>5]
seventh_word = [str(path).split('/')[6] for path in df.path.value_counts().index if len(str(path).split('/'))>6]

word_list = [first_word, second_word, third_word, fourth_word, fifth_word, sixth_word, seventh_word]

def flatten(a_list):
    return [word for words in word_list for word in words]

separators = ['.', '-', '_']


exercise = ['exercise', 'Exercise', 'challenge', 'practice', 'Challenge', 'Practice',
            'problem', 'Problem', 'question', 'Question']

programming = ['sql', 'python', 'java', 'Java', 'Python', 'SQL', 'Sql', 'ruby', 'Ruby', 'c++', 'C++',
              'programming', 'Programming', 'functions', 'Functions', 'import', 'Import', 'git', 'Git', 'terminal',
              'jupyter', 'Jupyter', 'pandas', 'Pandas', 'acquire', 'prepare']

math = ['stat', 'Stat', 'proba', 'Proba']

math_count = 0
exercise_count = 0
programming_count = 0

flattened = set(flatten(word_list))

for ex in exercise:
    for word in flattened:
        if ex in word:
            exercise_count+=1
            
for mth in math:
    for word in flattened:
        if mth in word:
            math_count +=1
            
for pro in programming:
    for word in flattened:
        if pro in word:
            programming_count+=1
            
print(f'exercise count: {exercise_count}')
print(f'programming count: {programming_count}')
print(f'math count: {math_count}')
for word in flattened:
    if 'question' in word or 'Question' in word:
        print(word)
    if 'prepare' in word:
        print(word)

exercise count: 45
programming count: 123
math count: 60
mock-behavioral-question
prepared-statements
interview_questions_students
interview-questions
interview_questions
mock-behavioral-questions
prepared-statements.html
interview-questions-behavioral
6.3-prepare-part-3
6.1-prepare-part-1
4-prepare
6.2-prepare-part-2
interview-questions-students
prepare
interview_questions_instructors
interview-questions-tech
6.4-prepare-part-4


In [39]:
once

0            storytelling/creating-custom-fields
1               storytelling/creating-dashboards
2                  storytelling/creating-stories
3                                    slides/form
4                        slides/formmodelbinding
                         ...                    
452    diagram-of-ds-pipeline-fraud-example.jpeg
453                     data-science-modules.jpg
454                 ml-methodologies-drawing.jpg
455                              case-statements
456                                        where
Length: 457, dtype: object

In [156]:
resources = ['jpg', 'jpeg', 'png', 'pdf', 'slides', 'doc', 'chart', 'gif']

presentation = ['story', 'stories', 'present']
programming = ['java', 'python', 'git', 'cloud']

def search_once(once, category):
    count = 0
    paths_in_category = []
    
    for word in category:
        
        for path in once.values:
            if word in str(path).lower():
                count+= 1
                paths_in_category.append(str(path).lower())
                
    print(f'{category}: {count}')
    return paths_in_category

def subset_once(lst_of_categories, once):
    flattened = flatten(lst_of_categories)
    all_

search_once(once, resources)
search_once(once, presentation)
search_once(once, programming)


['jpg', 'jpeg', 'png', 'pdf', 'slides', 'doc', 'chart', 'gif']: 31
['story', 'stories', 'present']: 18
['java', 'python', 'git', 'cloud']: 61


['javascript-i/introduction/working-with-data-types-operators-and-variables/600%20navarro%20st.,%20san%20antonio,%20tx%2078205,%20united%20states%20of%20america',
 'javascript-i/introduction/working-with-data-types-operators-and-variables/google.com',
 'javascript-i/dom',
 'javascript-i/loops/google.com',
 'appendix/further-reading/java/intellij',
 'javascript-i/operators',
 'appendix/extra-exercises/javascript',
 'appendix/java/intellij-tomcat-configuration',
 'java-i/control-stateedabitments-and-loops',
 'java-iii/mvc.jsp',
 'mysql/java-iii',
 'appendix/further-reading/javascript',
 'javascript-i/bom-and-dom/bom/http%22://www.google.com',
 'javascript-i/bom-and-dom/bom/google.com',
 'java-1',
 'further-reading/java/intellij-tomcat-configuration',
 'javascript-i/math',
 'java-i/control-statements-and-loops/g',
 'javascript/functions',
 'javascript',
 'appendix/java',
 'javascri',
 'javascript/apendix',
 'javascript-i/primitive-types',
 'further-reading/javascript/array-splice',
 'extr

In [190]:
resources = []
endpoints = []
lessons = []
for val in once.values:
    resources.append(str(val).split('/')[0])
    endpoints.append(str(val).split('/')[1:])
    if len(str(val).split('/')) > 2:
        lessons.append(str(val).split('/')[2])

res_df = pd.Series(resources, name='resources').value_counts()
endpoint_df = pd.Series(['/'.join(end) for end in endpoints], name='endpoint')
lesson_df = pd.Series(lessons, name='lessons')
endpoint_df

0      creating-custom-fields
1         creating-dashboards
2            creating-stories
3                        form
4            formmodelbinding
                ...          
452                          
453                          
454                          
455                          
456                          
Name: endpoint, Length: 457, dtype: object

In [191]:
endpoint_df[endpoint_df.values != ''].head(20)

0                                creating-custom-fields
1                                   creating-dashboards
2                                      creating-stories
3                                                  form
4                                      formmodelbinding
5                                                   jpa
8                                            bad-charts
9                              misleading1_baseball.jpg
10                                  misleading1_fox.jpg
11                               misleading3_deaths.jpg
12    introduction/working-with-data-types-operators...
13    introduction/working-with-data-types-operators...
17                                  regular-expressions
18                                       t-block-resume
19                             further-reading/html-css
20                                spreadsheets-overview
21                             visualization-with-excel
22                                              

In [192]:
(res_df > 1).sum()

46

In [193]:
(res_df == 1).sum()

163

In [194]:
res_df.head(20)

appendix                    45
html-css                    18
javascript-i                16
storytelling                13
fundamentals                13
regression                  12
mysql                       10
python                      10
examples                    10
classification              10
nlp                          9
clustering                   8
timeseries                   7
stats                        7
jquery                       6
slides                       6
extra-challenges             6
distributed-ml               6
cli                          5
professional-development     5
Name: resources, dtype: int64

In [195]:
data_science = ['slides', 'storytelling', 'fundamentals', 'regression', 'mysql', 'python', 'classification', 'examples', 'professional-development', 'nlp', 'clustering', 'timeseries', 'stats']
web_dev = ['examples', 'professional_development', 'jquery', 'html-css', 'appendix', 'java', 'slides']

In [199]:
lesson_df.unique()[lesson_df.value_counts() == 1]

array(['creating-files-and-directories', 'moving-files', 'more-topics',
       ':view-page', 'host-wildcards', 'interview-guidance',
       'interview-questions-tech', 'mysql', 'post-interview-review-form',
       'clauses', '..%2f', '..%5c', '..%c0%af', '..%255c',
       '%252e%252e%255c', 'google.com', 'requests-and-responses', 'notes',
       'java', 'php', 'form-model-binding', 'favicon.ico', 'javascript',
       'images', 'flexbox-fundamentals', 'css-grid-fundamentals',
       'css-grid-intro', 'layout', 'intellij-tomcat-configuration',
       '2-listing-files', '2-Overview', '2-overview', '7-more-topics',
       'null', 'navigating-the-filesystem.md', 'extra-exercises',
       'sample-database', 'database-design', 'ntellij', 'welcome', 'bom',
       'relationships', 'positioning', 'security-use-cases',
       'www.qlik.com', 'github.com', 'www.opensecrets.org',
       'www.followthemoney.org', 'www.census.gov', 'grid', '03-filepaths',
       'flexbox-additional-concepts', 'boostr

In [205]:
len(df.ip.unique())

5532

In [231]:
import requests
import os


def scrape_ip_locations(df, index_num=0):
    """ needs original log df (900000 rows), index_num = n where n is which ip address index you want to start at."""
    locations = []
    i=0
    
    filename = 'ip_geographical_data.csv'
    
    if os.path.isfile(filename):
        result_df = pd.read_csv(filename)
    else:
        result_df = pd.DataFrame()
    
    # add bracket after 'ip_list' with number to add to df
    # 684
    
    ip_list = df.ip.unique()[index_num:]
    for ip in ip_list:
        url = f'http://ipinfo.io/{ip}/json'
        data = requests.get(url).json()

        i+=1
        print(f'\r{i}', end='')
        try:
            IP=data['ip']
            org=data['org']
            city = data['city']
            country=data['country']
            region=data['region']
            locations.append({'ip':IP, 'org':org, 'city':city, 'country':country, 'region':region})
        except:
            print(data)
            
    result_df = pd.concat([result_df,pd.DataFrame(locations)])
    result_df = result_df.drop_duplicates()
    
    result_df.to_csv(filename, index=False)

    return result_df


In [223]:
locations=pd.DataFrame(locations)
locations.head(10)

Unnamed: 0,ip,org,city,country,region
0,97.105.19.61,AS11427 Charter Communications Inc,Houston,US,Texas
1,72.177.226.58,AS11427 Charter Communications Inc,San Antonio,US,Texas
2,173.173.119.75,AS11427 Charter Communications Inc,San Antonio,US,Texas
3,172.56.15.50,"AS21928 T-Mobile USA, Inc.",Houston,US,Texas
4,172.56.15.203,"AS21928 T-Mobile USA, Inc.",Houston,US,Texas
5,70.123.224.242,AS11427 Charter Communications Inc,Schertz,US,Texas
6,172.56.15.6,"AS21928 T-Mobile USA, Inc.",Houston,US,Texas
7,47.189.243.89,"AS5650 Frontier Communications of America, Inc.",Carrollton,US,Texas
8,172.56.15.15,"AS21928 T-Mobile USA, Inc.",Houston,US,Texas
9,172.56.15.46,"AS21928 T-Mobile USA, Inc.",Houston,US,Texas


In [225]:
locations.shape

(684, 5)

In [230]:
locations.to_csv('ip_geographical_data.csv', index=False)