# EventXtract MOOC2015 : Data Preprocessing

In this section you will preprocess the data to create a better representation of learners by performing a scaling on the data and detecting (and optionally removing) outliers. Preprocessing data is often a critical step in ensuring that the results you obtain from your analysis are significant and meaningful.

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_colwidth=1000

In [2]:
# Load the raw event log. `error_bad_lines=False` tells pandas to skip rows it
# cannot parse instead of raising.
mooc = pd.read_csv(
    './Input/StatLearning_2015_EventXtract_filter_date.csv',
    sep=",",
    index_col=False,
    error_bad_lines=False,
)

# Parse timestamps and put the whole log in chronological order.
mooc['time'] = pd.to_datetime(mooc['time'])
mooc = mooc.sort_values("time")

# Drop the columns that the rest of this notebook does not use, then index by learner.
mooc = mooc.drop(['Unnamed: 0', 'quarter', 'course_display_name'], axis=1)
mooc = mooc.set_index('anon_screen_name')

# Keep an untouched copy of the raw event string; `event_type` is rewritten below.
mooc['events'] = mooc['event_type']
mooc.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,event_type,ip_country,time,resource_display_name,success,video_code,video_current_time,video_speed,video_old_time,video_new_time,video_seek_type,video_new_speed,video_old_speed,goto_from,goto_dest,events
anon_screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9a3d3f219e4cc1122e929e2daa4a7afcfc6ec2d9,about,GBR,2015-01-19 00:02:43,,,,,,,,,,,-1,-1.0,about
d39809aa673c09a836815d09b44f7ca9c9bed463,about,ITA,2015-01-19 00:02:49,,,,,,,,,,,-1,-1.0,about
d39809aa673c09a836815d09b44f7ca9c9bed463,edx.course.enrollment.activated,ITA,2015-01-19 00:03:33,,,,,,,,,,,-1,-1.0,edx.course.enrollment.activated
34a807d69e6169b69c21572dbc45524b4384aadf,about,EGY,2015-01-19 00:03:44,,,,,,,,,,,-1,-1.0,about
34a807d69e6169b69c21572dbc45524b4384aadf,about,EGY,2015-01-19 00:06:34,,,,,,,,,,,-1,-1.0,about


In [3]:
# Summarise the column dtypes and memory footprint of the loaded frame.
mooc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18475724 entries, 9a3d3f219e4cc1122e929e2daa4a7afcfc6ec2d9 to e17c708d47c8513691d0d09cb29c5aa968317cd3
Data columns (total 16 columns):
event_type               object
ip_country               object
time                     datetime64[ns]
resource_display_name    object
success                  object
video_code               object
video_current_time       object
video_speed              object
video_old_time           object
video_new_time           float64
video_seek_type          object
video_new_speed          float64
video_old_speed          object
goto_from                int64
goto_dest                float64
events                   object
dtypes: datetime64[ns](1), float64(3), int64(1), object(11)
memory usage: 2.3+ GB


# Memory optimisation

In [4]:
# Store the country code as a categorical column instead of Python strings.
mooc['ip_country'] = mooc['ip_country'].astype('category')

# Session

In [5]:
# Inactivity gap (in seconds) after which a learner's next event opens a new session.
SESSION_GAP_SECONDS = 1800

# Move the learner id back into a regular column so it can be used as a group key.
mooc.reset_index(level=0, inplace=True)

# Time elapsed since the same learner's previous event (NaT on their first event).
mooc['time_diff'] = mooc.groupby('anon_screen_name')['time'].diff()

# Duration of an event = time until the same learner's next event, in seconds
# (NaN on the learner's last event).
mooc['event_duration'] = (
    mooc.groupby('anon_screen_name')['time_diff'].shift(-1).dt.total_seconds()
)

# An event starts a new session when it follows a gap longer than the threshold.
# The session number is the per-learner running count of such starts (0-based).
# Comparing NaN/NaT with the threshold yields False, so a learner's first event
# stays in session 0. This is computed with a vectorised grouped cumsum rather
# than a Python lambda applied per learner.
starts_new_session = (
    mooc['time_diff'].dt.total_seconds() > SESSION_GAP_SECONDS
).astype(int)
mooc['nbr_session'] = starts_new_session.groupby(mooc['anon_screen_name']).cumsum()

In [6]:
# Inspect the last rows to check the new time_diff / event_duration / nbr_session columns.
mooc.tail()

Unnamed: 0,anon_screen_name,event_type,ip_country,time,resource_display_name,success,video_code,video_current_time,video_speed,video_old_time,video_new_time,video_seek_type,video_new_speed,video_old_speed,goto_from,goto_dest,events,time_diff,event_duration,nbr_session
18475719,e17c708d47c8513691d0d09cb29c5aa968317cd3,/courses/HumanitiesandScience/StatLearning/Winter2015/xblock/i4x:;_;_HumanitiesandScience;_StatLearning;_video;_d24492910eaf405ebfb3515289d82236/handler/xmodule_handler/save_user_state,KOR,2015-04-05 23:59:57,Video,,,,,,,,,,-1,-1.0,/courses/HumanitiesandScience/StatLearning/Winter2015/xblock/i4x:;_;_HumanitiesandScience;_StatLearning;_video;_d24492910eaf405ebfb3515289d82236/handler/xmodule_handler/save_user_state,00:00:00,2.0,38
18475720,e17c708d47c8513691d0d09cb29c5aa968317cd3,/courses/HumanitiesandScience/StatLearning/Winter2015/xblock/i4x:;_;_HumanitiesandScience;_StatLearning;_video;_3d23493a06cc42c0bc83dfd50502990d/handler/transcript/translation/en,KOR,2015-04-05 23:59:59,Video,,,,,,,,,,-1,-1.0,/courses/HumanitiesandScience/StatLearning/Winter2015/xblock/i4x:;_;_HumanitiesandScience;_StatLearning;_video;_3d23493a06cc42c0bc83dfd50502990d/handler/transcript/translation/en,00:00:02,0.0,38
18475721,e17c708d47c8513691d0d09cb29c5aa968317cd3,load_video,KOR,2015-04-05 23:59:59,Video,,QlyROnAjnEk,,,,,,,,-1,-1.0,load_video,00:00:00,0.0,38
18475722,e17c708d47c8513691d0d09cb29c5aa968317cd3,show_transcript,KOR,2015-04-05 23:59:59,,,,,,,,,,,-1,-1.0,show_transcript,00:00:00,1.0,38
18475723,e17c708d47c8513691d0d09cb29c5aa968317cd3,/courses/HumanitiesandScience/StatLearning/Winter2015/xblock/i4x:;_;_HumanitiesandScience;_StatLearning;_video;_3d23493a06cc42c0bc83dfd50502990d/handler/xmodule_handler/save_user_state,KOR,2015-04-06 00:00:00,Video,,,,,,,,,,-1,-1.0,/courses/HumanitiesandScience/StatLearning/Winter2015/xblock/i4x:;_;_HumanitiesandScience;_StatLearning;_video;_3d23493a06cc42c0bc83dfd50502990d/handler/xmodule_handler/save_user_state,00:00:01,,38


#  Events

## courseware cleaning

In [7]:
# Rows whose event string is a courseware URL.
df = mooc[mooc.event_type.str.contains('/courseware/')]

# Count how often each distinct courseware URL occurs. The frame is built with
# explicit column names so the result does not depend on how a given pandas
# version names the output of `value_counts()`.
occurrences = df.event_type.value_counts()
courseware_df = pd.DataFrame(
    {'event_type': occurrences.index, 'nbr_occ': occurrences.values},
    columns=['event_type', 'nbr_occ'],
)

# Most frequent URLs first, with a clean 0..n-1 index.
courseware_df = (
    courseware_df.sort_values('nbr_occ', ascending=False).reset_index(drop=True)
)
courseware_df.tail(5)

Unnamed: 0,event_type,nbr_occ
750,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/8878fb6f600042fe98d774e0db26f87a/cc6dccd75dd041128f9c60ea311915c3/footer-stanford-logo@2x.png,1
751,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/8878fb6f600042fe98d774e0db26f87a/b91ee2b82a6d49eb91e1dc6641cf5efe/linear-gradient(rgba(251,1
752,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/4cd5971758e84840b24d91c763df6ce8/e9481751b91d4f25b7a6d98fa5b7d371/stanford-s-logo.ac1a37d62c92.png,1
753,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/8878fb6f600042fe98d774e0db26f87a/c6fe7f06d25c4d3786e2f57a05dc0992/0.0980392),1
754,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/8878fb6f600042fe98d774e0db26f87a/b6580576409e4431b3a898d8122ca37e/251,1


In [8]:
# Most frequently requested courseware URLs.
courseware_df.head()

Unnamed: 0,event_type,nbr_occ
0,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/995220423fd14a4588d8e47920f1b5df/1a812f9f556b44109ce6b40178e52e4d/,61309
1,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/f6eb0a2902904c6e8f74a2c15833d1ad/dbbe6095fa55431b8253590e2658dc3e/,56300
2,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/f6eb0a2902904c6e8f74a2c15833d1ad/00c820e0c96d45209a8cd3c15d63c294/,54642
3,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/f6eb0a2902904c6e8f74a2c15833d1ad/,54355
4,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/995220423fd14a4588d8e47920f1b5df/99faa3a82fca4fc19adc577ce9f75afd/,46192


In [9]:
# Courseware URLs seen fewer than this many times are treated as noise
# (the tail of courseware_df shows stray asset and CSS fragments).
MIN_COURSEWARE_OCCURRENCES = 20

rare_courseware_urls = courseware_df.loc[
    courseware_df.nbr_occ < MIN_COURSEWARE_OCCURRENCES, 'event_type'
]
mask_file = mooc.event_type.isin(rare_courseware_urls)

# Drop the matching rows by label. `.ix` is deprecated, so the labels are taken
# straight from the index with the boolean mask.
mooc.drop(mooc.index[mask_file.values], inplace=True)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [10]:
# Row/column count after removing the rarely seen courseware URLs.
mooc.shape

(18474651, 20)

## Courseware mapping

In [11]:
# Replace every courseware URL with "courseware_<segment>", where <segment> is
# the second-to-last '/'-separated component of the URL (for a URL ending in
# '/', that is the last non-empty path component).
mask_t = mooc.event_type.str.contains('/courseware/')
mooc.loc[mask_t, 'event_type'] = (
    "courseware_" + mooc.loc[mask_t, 'event_type'].str.split("/").str.get(-2)
)

# Whatever still contains '/courseware' at this point (rows rewritten above no
# longer contain a slash) is labelled as the courseware menu.
mask_cr = mooc.event_type.str.contains('/courseware')
mooc.loc[mask_cr, 'event_type'] = "courseware_menu"

#### Chapter

In [12]:
# Courseware identifiers of the ten chapters, in course order.
chapters = [
    'courseware_995220423fd14a4588d8e47920f1b5df',
    'courseware_f6eb0a2902904c6e8f74a2c15833d1ad',
    'courseware_85b01caa12834b0dbaeff232fb77e123',
    'courseware_41ce0170b29f43ab9d490b5f37d16fdf',
    'courseware_9956347366744e1cac95f513e9235f9f',
    'courseware_8878fb6f600042fe98d774e0db26f87a',
    'courseware_43d59889973b4b34a7070918f2a7bb3f',
    'courseware_4cd5971758e84840b24d91c763df6ce8',
    'courseware_dfece96897994039a17547b575573447',
    'courseware_b6e69c3c9239444085f6a8a186cab4cc',
]

# Print how many events carry each chapter identifier.
for chapter in chapters:
    mask_cr = mooc.event_type == chapter
    chapter_events = mooc.loc[mask_cr, 'event_type']
    print(chapter_events.value_counts())

courseware_995220423fd14a4588d8e47920f1b5df    27560
Name: event_type, dtype: int64
courseware_f6eb0a2902904c6e8f74a2c15833d1ad    54355
Name: event_type, dtype: int64
courseware_85b01caa12834b0dbaeff232fb77e123    34322
Name: event_type, dtype: int64
courseware_41ce0170b29f43ab9d490b5f37d16fdf    23870
Name: event_type, dtype: int64
courseware_9956347366744e1cac95f513e9235f9f    19303
Name: event_type, dtype: int64
courseware_8878fb6f600042fe98d774e0db26f87a    17754
Name: event_type, dtype: int64
courseware_43d59889973b4b34a7070918f2a7bb3f    10156
Name: event_type, dtype: int64
courseware_4cd5971758e84840b24d91c763df6ce8    10491
Name: event_type, dtype: int64
courseware_dfece96897994039a17547b575573447    8943
Name: event_type, dtype: int64
courseware_b6e69c3c9239444085f6a8a186cab4cc    9055
Name: event_type, dtype: int64


In [13]:
# Courseware identifiers of the ten chapters, in course order; the position in
# this list (1-based) becomes the chapter number.
chapters = [
    'courseware_995220423fd14a4588d8e47920f1b5df',
    'courseware_f6eb0a2902904c6e8f74a2c15833d1ad',
    'courseware_85b01caa12834b0dbaeff232fb77e123',
    'courseware_41ce0170b29f43ab9d490b5f37d16fdf',
    'courseware_9956347366744e1cac95f513e9235f9f',
    'courseware_8878fb6f600042fe98d774e0db26f87a',
    'courseware_43d59889973b4b34a7070918f2a7bb3f',
    'courseware_4cd5971758e84840b24d91c763df6ce8',
    'courseware_dfece96897994039a17547b575573447',
    'courseware_b6e69c3c9239444085f6a8a186cab4cc',
]

# Relabel each chapter identifier as "chapter_<n>". `.loc` replaces the
# deprecated `.ix`; `enumerate` replaces the repeated `list.index` lookup.
for chapter_number, chapter in enumerate(chapters, start=1):
    mask = mooc.event_type == chapter
    mooc.loc[mask, 'event_type'] = "chapter_" + str(chapter_number)

In [14]:
# Spot-check: number of events relabelled as chapter 9.
mask_cr = mooc.event_type == 'chapter_9'
mooc.loc[mask_cr, 'event_type'].value_counts()

chapter_9    8943
Name: event_type, dtype: int64

#### Sequential

In [15]:
# Courseware identifiers of the sequentials of each chapter, in course order.
# Each entry is (chapter number, [sequential identifiers]); chapter 11 is the
# "finishing up" chapter.
SEQUENTIALS_BY_CHAPTER = [
    (1, ['courseware_99faa3a82fca4fc19adc577ce9f75afd',
         'courseware_1a812f9f556b44109ce6b40178e52e4d',
         'courseware_39d18e30d70e4c889bdb472bb70f90b0']),
    (2, ['courseware_dbbe6095fa55431b8253590e2658dc3e',
         'courseware_00c820e0c96d45209a8cd3c15d63c294',
         'courseware_97f87bf36b4a4eeb90e1ca1171fb6750',
         'courseware_72254d647bc74a989901d1b67aefd429',
         'courseware_d4d18f0dda714d94b8687b2ab0b2dcec',
         'courseware_cd02bd9aa5a84fa6b338546a480e3c3b',
         'courseware_bdc0347d5ef74b47bd60c93eae84b924']),
    (3, ['courseware_441b6190c14e46c29493ae9dcc41eb05',
         'courseware_a728c0772dfd494495aaf8914d59fb84',
         'courseware_35b30934bfd948a188ddb93cfd532a95',
         'courseware_bfb7cfefe2a94475812ac1e338460f9e',
         'courseware_86583341040b45ed9889e6056db856d6',
         'courseware_00b646c1309e42f2b8c1a770516fdef4',
         'courseware_59079702cb414b35b0b2c5edcd752331']),
    (4, ['courseware_d52d8c1e3e9a439b968d9d016262a257',
         'courseware_f6fd9737f77847718e7cf6497ba0539f',
         'courseware_b31ca38fdedd4841b964a92a3966c352',
         'courseware_b01ac47d032b4d38aed7d93108a673bf',
         'courseware_6a799584d6824c3ba1c1db1ea35de419',
         'courseware_59132b68ab7642b694e53d1171eb6a5a',
         'courseware_564c63c594e343fd916acb023054c5f5',
         'courseware_3e615c5c84894a49bbb6347c353f3a83',
         'courseware_bbbc5533cd31451183d116f262c100ea',
         'courseware_4066183e3bac4f9396d932e0970163bf']),
    (5, ['courseware_0c365a9e13094cb89facf1b9e6e02843',
         'courseware_ed057ff7b3fc49c591df885e42cf0eb8',
         'courseware_8938de1a9b8f494ea1cf5cb81c0e8947',
         'courseware_34a08990ca204413a090d58ab92c22aa',
         'courseware_045a39ba8c7749498408cf65706be51f',
         'courseware_3917fe87efba4f9ea4908b9e316d5fe0',
         'courseware_9339db132f504f809056f221271d9795',
         'courseware_8f5ce90bd80340019b2c38e6fd42ab82']),
    (6, ['courseware_4654e3322a044c91ab1dd0690c454922',
         'courseware_a200b5393d7246a78991d9b5a428cce7',
         'courseware_b6580576409e4431b3a898d8122ca37e',
         'courseware_c6fe7f06d25c4d3786e2f57a05dc0992',
         'courseware_b010c724dac14079a5610ea8ff531813',
         'courseware_c5b82b99d36f4b97b6d6c5140cc96fb7',
         'courseware_b91ee2b82a6d49eb91e1dc6641cf5efe',
         'courseware_99c04c866a9241b59736e03a1915829f',
         'courseware_cc6dccd75dd041128f9c60ea311915c3',
         'courseware_77d7b309956c42f28e116562b427ec7b',
         'courseware_d04d935adad54682a6d2951ea316d294',
         'courseware_945b8384b947450788abb8d0f6e88235',
         'courseware_0fb46606cdd94708b9fcfef4efbb21d1']),
    (7, ['courseware_23960f99395f437bab4d19c7eaf523d0',
         'courseware_06b9be5c96eb42ea83dc7f057cae4f35',
         'courseware_4e220b66424f48bda59cac974565cf93',
         'courseware_a071757474484065a3e2a88575fa6e12',
         'courseware_10d2c51f157d456bb6c9a66fababbe16',
         'courseware_0fa2d1b669d64d4388109a95591250dc']),
    (8, ['courseware_183f7baba27e46e9b9e46cd64a3ddbf5',
         'courseware_bb3f21ec63554788b9a3023c22a98118',
         'courseware_b55e5d69a4da4959934865716a0f8491',
         'courseware_6ad06d5d9c5740c2ade97b311e501331',
         'courseware_c08295503f0547ab819d092258701aca',
         'courseware_1bbe67e98ce64f5a966ef67e1d41ceaf',
         'courseware_e9481751b91d4f25b7a6d98fa5b7d371',
         'courseware_cd47429ce9e74315a5c53630c5db9f9f']),
    (9, ['courseware_9a928da468294f57800857c9e7e928f3',
         'courseware_4670b6eaffa84b40af6288bdbafde38e',
         'courseware_2a78272c6c464c03b54e2352fab39f94',
         'courseware_b43d493911fb4f9cbc244162a577177b',
         'courseware_7eff0af1291548b680aef4755de24388',
         'courseware_5c277deedff041cda22fe6a05a25bea3']),
    (10, ['courseware_005632d64eec4fe293d6c346ed364ecc',
          'courseware_d5befef2249f4fecab75d4be9210a980',
          'courseware_d9fb466566a346279a658adc50a4b450',
          'courseware_fed5d97f0a5d484d85fdd7ed5d2ebb1a',
          'courseware_73edc3c8a5a345df8f93cd869a70c0eb',
          'courseware_7c87061bd65545dd82377ae89cfa4a5a',
          'courseware_5dcb73f4d7f64d8b8c77b42adf29fab0']),
    (11, ['courseware_894e342c6bfc4890bdd326b000705532']),
]

# Relabel every sequential identifier as "ch<chapter>_Seq<position>", where the
# position is the 1-based rank of the sequential inside its chapter list.
# One table-driven loop replaces the eleven copy-pasted loops, and `.loc`
# replaces the deprecated `.ix`.
for chapter_number, Sequentials in SEQUENTIALS_BY_CHAPTER:
    for position, Sequential in enumerate(Sequentials, start=1):
        mask = mooc.event_type == Sequential
        mooc.loc[mask, 'event_type'] = "ch{}_Seq{}".format(chapter_number, position)

## Events detection

In [16]:
# For URLs that address a problem, sequential or video block, keep only the
# last '/'-separated component of the URL as the event type.
# The markers are processed in this order, and each mask is computed on the
# column as left by the previous step. `.loc` replaces the deprecated `.ix`.
for block_marker in (';_problem', ';_sequential;', ';_video;'):
    mask = mooc.event_type.str.contains(block_marker)
    mooc.loc[mask, 'event_type'] = (
        mooc.loc[mask, 'event_type'].str.split('/').str.get(-1)
    )

In [17]:
# Discussion URLs are collapsed into short labels. The steps run in order and
# each mask is evaluated on the column as updated by the previous steps.
# `.loc` replaces the deprecated `.ix` throughout.

# Comment actions: "comment_<last URL component>".
mask = mooc.event_type.str.contains('/discussion/comments/')
mooc.loc[mask, 'event_type'] = (
    "comment_" + mooc.loc[mask, 'event_type'].str.split("/").str.get(-1)
)

# Thread actions: "thread_<last URL component>".
mask_t = mooc.event_type.str.contains('/discussion/threads/')
mooc.loc[mask_t, 'event_type'] = (
    "thread_" + mooc.loc[mask_t, 'event_type'].str.split("/").str.get(-1)
)

# Forum URLs that point at a thread are labelled as viewing a post.
mask_f = (mooc.event_type.str.contains('/discussion/forum/')
          & mooc.event_type.str.contains('/threads/'))
mooc.loc[mask_f, 'event_type'] = "view_post"

# User pages: a URL ending in "followed" becomes 'user_followed', any other
# user URL becomes 'user_view'.
mask_u = mooc.event_type.str.contains('/discussion/forum/users/')
last_component = mooc.loc[mask_u, 'event_type'].str.split('/').str.get(-1)
mooc.loc[mask_u, 'event_type'] = last_component.apply(
    lambda x: 'user_followed' if x == 'followed' else 'user_view'
)

In [18]:
# Number of distinct index labels; it equals the row count when the index has no duplicates.
mooc.index.nunique()

18474651

In [19]:
# Thread creation needs two conditions, so it is handled on its own.
# `.loc` replaces the deprecated `.ix`.
mask_f = (mooc.event_type.str.contains('/discussion/')
          & mooc.event_type.str.contains('/threads/create'))
mooc.loc[mask_f, 'event_type'] = "thread_create"

# Remaining single-pattern rules, applied in this order; every mask is computed
# on the column as left by the previous rule. Patterns are regular expressions
# (pandas' default for `str.contains`).
LABEL_RULES = [
    ('/discussion/forum/search', "forum_search"),
    ('/info|/Info', "info"),
    ('/progres', "progress"),
    ('/jump_to/', "Statement"),
    ('ba1951b8f66c4cdca2fcaabcdc91b792', "slides"),
    ('0d68641c19484ef9aa6aeea03426dc68', "R_sessions"),
]
for pattern, label in LABEL_RULES:
    mask = mooc.event_type.str.contains(pattern)
    mooc.loc[mask, 'event_type'] = label


In [20]:
# "courseware_courseware" comes from URLs whose second-to-last component was
# "courseware" itself; it is treated as the courseware menu.
# `.loc` replaces the deprecated `.ix`.
mask_s = mooc.event_type.str.contains('courseware_courseware')
mooc.loc[mask_s, 'event_type'] = "courseware_menu"

# Any URL containing /about or /About is labelled as the "about" page.
mask_i = mooc.event_type.str.contains('/about|/About')
mooc.loc[mask_i, 'event_type'] = "about"

In [21]:
# Relabelling only rewrites event_type, so the shape should be unchanged here.
mooc.shape

(18474651, 20)

## Remove noisy events

In [23]:
# Requests for static assets / raw files are noise for the analysis.
# The dot is escaped so it only matches a literal '.'; unescaped it would match
# any character (so '.js' would match e.g. "ajs", and '.ini' would match
# "mini"). The group is non-capturing to avoid pandas' warning about match groups.
NOISY_FILE_PATTERN = r'\.(?:png|jpg|gif|css|js|php|html|xml|txt|ini)'

mask_file = mooc.event_type.str.contains(NOISY_FILE_PATTERN)

# Drop the matching rows by label (`.ix` is deprecated).
mooc.drop(mooc.index[mask_file.values], inplace=True)

In [24]:
# Any event that still carries a course URL ("Winter2015") or a login/account
# path is flagged so it can be counted and then removed in the following cells.
# `.loc` replaces the deprecated `.ix`.
mask_w = mooc.event_type.str.contains('Winter2015|/shib-login/|/accounts/')
mooc.loc[mask_w, 'event_type'] = "to_drop"

In [25]:
# How many rows were flagged for removal.
mooc.loc[mooc.event_type == "to_drop"].shape

(200163, 20)

In [26]:
# Event types removed from the log: the rows flagged above plus three
# event names that are not kept for the analysis.
discarded_events = ['to_drop', 'en', 'input_ajax', 'save_user_state']
mooc = mooc[~mooc.event_type.isin(discarded_events)]

In [27]:
# Row/column count once the noisy events have been removed.
mooc.shape

(14308273, 20)

## Export event

In [28]:
# Export the list of distinct event types (sorted in descending order) to an
# Excel sheet for manual inspection.
mooc_event = mooc.event_type.unique()
df_event = pd.DataFrame(mooc_event, columns=['event_type'])
df_event.sort_values(['event_type'], ascending=False, inplace=True)

# The context manager writes and closes the workbook on exit, even if
# `to_excel` raises; it replaces the explicit `ExcelWriter.save()` call.
with pd.ExcelWriter('mooc15_event_brut.xlsx') as ex:
    df_event.to_excel(ex, sheet_name="mooc15_event")

# Weeks 
-  Week 1: Introduction and Overview of Statistical Learning (Chapters 1-2, starts Jan 19)
-  Week 2: Linear Regression (Chapter 3, starts Jan 24)
-  Week 3: Classification (Chapter 4, starts Jan 31)
-  Week 4: Resampling Methods (Chapter 5, starts Feb 7)
-  Week 5: Linear Model Selection and Regularization (Chapter 6, starts Feb 14)
-  Week 6: Moving Beyond Linearity (Chapter 7, starts Feb 21)
-  Week 7: Tree-based Methods (Chapter 8, starts Feb 28)
-  Week 8: Support Vector Machines (Chapter 9, starts Mar 7)
-  Week 9: Unsupervised Learning (Chapter 10, starts Mar 14)

In [29]:
# (week, chapter number, number of sequentials in that chapter).
# Chapters 1 and 2 both belong to week 1; every later chapter gets its own week.
WEEK_CONTENT = [
    (1, 1, 3),
    (1, 2, 7),
    (2, 3, 7),
    (3, 4, 10),
    (4, 5, 8),
    (5, 6, 13),
    (6, 7, 6),
    (7, 8, 8),
    (8, 9, 6),
    (9, 10, 7),
]

# Tag every chapter / sequential event with the course week it belongs to.
# Events whose type is not one of these labels keep a missing (NaN) week.
# `.loc` replaces the deprecated `.ix`; the label lists are generated instead
# of being written out by hand.
for week, chapter_number, sequential_count in WEEK_CONTENT:
    week_labels = ['chapter_{}'.format(chapter_number)]
    week_labels += [
        'ch{}_Seq{}'.format(chapter_number, position)
        for position in range(1, sequential_count + 1)
    ]
    mask_week = mooc.event_type.isin(week_labels)
    mooc.loc[mask_week, 'week'] = week

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


# Organising Dataframe

In [30]:
# Preview the frame with the new `week` column.
mooc.head()

Unnamed: 0,anon_screen_name,event_type,ip_country,time,resource_display_name,success,video_code,video_current_time,video_speed,video_old_time,...,video_seek_type,video_new_speed,video_old_speed,goto_from,goto_dest,events,time_diff,event_duration,nbr_session,week
0,9a3d3f219e4cc1122e929e2daa4a7afcfc6ec2d9,about,GBR,2015-01-19 00:02:43,,,,,,,...,,,,-1,-1.0,about,NaT,65913.0,0,
1,d39809aa673c09a836815d09b44f7ca9c9bed463,about,ITA,2015-01-19 00:02:49,,,,,,,...,,,,-1,-1.0,about,NaT,44.0,0,
2,d39809aa673c09a836815d09b44f7ca9c9bed463,edx.course.enrollment.activated,ITA,2015-01-19 00:03:33,,,,,,,...,,,,-1,-1.0,edx.course.enrollment.activated,00:00:44,71802.0,0,
3,34a807d69e6169b69c21572dbc45524b4384aadf,about,EGY,2015-01-19 00:03:44,,,,,,,...,,,,-1,-1.0,about,NaT,170.0,0,
4,34a807d69e6169b69c21572dbc45524b4384aadf,about,EGY,2015-01-19 00:06:34,,,,,,,...,,,,-1,-1.0,about,00:02:50,19450.0,0,


In [31]:
# Final column order; `time_diff` is left out, so it is dropped here.
ordered_columns = [
    'anon_screen_name', 'week', 'event_type',
    'resource_display_name', 'time', 'event_duration', 'nbr_session',
    'success', 'video_code',
    'video_current_time', 'video_speed', 'video_old_time', 'video_new_time',
    'video_seek_type', 'video_new_speed', 'video_old_speed',
    'goto_from', 'goto_dest', 'ip_country', 'events',
]

# Reorder the columns and index the frame by learner.
mooc = mooc[ordered_columns].set_index('anon_screen_name')

In [32]:
# Leftover debugging snippets (disabled):
# mooc_processed=np.nan
# mooc[mooc.index=="d63ac91a4a5dd1039264c0c56e34ca2421e46ec6"]
# Preview the last rows of the reorganised frame.
mooc.tail()

Unnamed: 0_level_0,week,event_type,resource_display_name,time,event_duration,nbr_session,success,video_code,video_current_time,video_speed,video_old_time,video_new_time,video_seek_type,video_new_speed,video_old_speed,goto_from,goto_dest,ip_country,events
anon_screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
d63ac91a4a5dd1039264c0c56e34ca2421e46ec6,,info,,2015-04-05 23:59:46,,0,,,,,,,,,,-1,-1.0,TWN,/courses/HumanitiesandScience/StatLearning/Winter2015/info
e17c708d47c8513691d0d09cb29c5aa968317cd3,5.0,ch6_Seq10,Ch6 Linear Model Selection and Regularization,2015-04-05 23:59:56,1.0,38,,,,,,,,,,-1,-1.0,KOR,/courses/HumanitiesandScience/StatLearning/Winter2015/courseware/8878fb6f600042fe98d774e0db26f87a/77d7b309956c42f28e116562b427ec7b/
e17c708d47c8513691d0d09cb29c5aa968317cd3,,page_close,,2015-04-05 23:59:57,0.0,38,,,,,,,,,,-1,-1.0,KOR,page_close
e17c708d47c8513691d0d09cb29c5aa968317cd3,,load_video,Video,2015-04-05 23:59:59,0.0,38,,QlyROnAjnEk,,,,,,,,-1,-1.0,KOR,load_video
e17c708d47c8513691d0d09cb29c5aa968317cd3,,show_transcript,,2015-04-05 23:59:59,1.0,38,,,,,,,,,,-1,-1.0,KOR,show_transcript


# Session Generation

In [33]:
# mooc.reset_index(level=0, inplace=True)
# mooc['time_diff'] = mooc.groupby('anon_screen_name')['time'].diff()
# mooc['event_duration'] = mooc.groupby('anon_screen_name')['time_diff'].shift(-1)
# mooc['event_duration'] = mooc['event_duration'].dt.total_seconds()
# mooc['nbr_session'] = mooc.groupby('anon_screen_name')['event_duration'].apply(
#         lambda s: (s.shift() > 2700).fillna(0).cumsum(skipna=False)
# )

In [34]:
# Expected row count: 14308273 (same as after the noise removal step).
mooc.shape

(14308273, 19)

# Export to CSV

In [36]:
# Turn the learner id back into a regular column so it can be exported by name.
mooc = mooc.reset_index(level=0)
mooc.head()

Unnamed: 0,anon_screen_name,week,event_type,resource_display_name,time,event_duration,nbr_session,success,video_code,video_current_time,video_speed,video_old_time,video_new_time,video_seek_type,video_new_speed,video_old_speed,goto_from,goto_dest,ip_country,events
0,9a3d3f219e4cc1122e929e2daa4a7afcfc6ec2d9,,about,,2015-01-19 00:02:43,65913.0,0,,,,,,,,,,-1,-1.0,GBR,about
1,d39809aa673c09a836815d09b44f7ca9c9bed463,,about,,2015-01-19 00:02:49,44.0,0,,,,,,,,,,-1,-1.0,ITA,about
2,d39809aa673c09a836815d09b44f7ca9c9bed463,,edx.course.enrollment.activated,,2015-01-19 00:03:33,71802.0,0,,,,,,,,,,-1,-1.0,ITA,edx.course.enrollment.activated
3,34a807d69e6169b69c21572dbc45524b4384aadf,,about,,2015-01-19 00:03:44,170.0,0,,,,,,,,,,-1,-1.0,EGY,about
4,34a807d69e6169b69c21572dbc45524b4384aadf,,about,,2015-01-19 00:06:34,19450.0,0,,,,,,,,,,-1,-1.0,EGY,about


In [37]:
# Columns written to the preprocessed CSV, in output order.
export_columns = [
    'anon_screen_name', 'week', 'event_type',
    'resource_display_name', 'time', 'event_duration', 'nbr_session',
    'success', 'video_code',
    'video_current_time', 'video_speed', 'video_old_time', 'video_new_time',
    'video_seek_type', 'video_new_speed', 'video_old_speed',
    'goto_from', 'goto_dest', 'ip_country', 'events',
]

# The row index is written too (pandas' default), as an unnamed first column.
mooc.to_csv("EventXtract_MOOC2015_Preprocessed.csv", columns=export_columns)