In [4]:
import pandas as pd

# Load the uploaded dataset to check its structure and contents
file_path = 'output_truncated.csv'
data = pd.read_csv(file_path)

# Display the first few rows and basic information about the dataset
data.head(), data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         10000 non-null  int64  
 1   institute          10000 non-null  object 
 2   course_id          10000 non-null  object 
 3   year               10000 non-null  int64  
 4   semester           10000 non-null  object 
 5   userid_DI          10000 non-null  object 
 6   viewed             10000 non-null  int64  
 7   explored           10000 non-null  int64  
 8   certified          10000 non-null  int64  
 9   final_cc_cname_DI  10000 non-null  object 
 10  LoE_DI             10000 non-null  object 
 11  gender             9722 non-null   object 
 12  grade              10000 non-null  float64
 13  start_time_DI      10000 non-null  object 
 14  last_event_DI      10000 non-null  object 
 15  nevents            10000 non-null  int64  
 16  ndays_act          1000

(   Unnamed: 0 institute course_id  year semester       userid_DI  viewed  \
 0           4  HarvardX    PH207x  2012     Fall  MHxPC130313697       0   
 1           6  HarvardX    PH207x  2012     Fall  MHxPC130237753       1   
 2           7  HarvardX     CS50x  2012   Summer  MHxPC130202970       1   
 3          20  HarvardX     CS50x  2012   Summer  MHxPC130223941       1   
 4          22  HarvardX    PH207x  2012     Fall  MHxPC130317399       0   
 
    explored  certified               final_cc_cname_DI  ... grade  \
 0         0          0                           India  ...   0.0   
 1         0          0                   United States  ...   0.0   
 2         0          0                   United States  ...   0.0   
 3         0          0  Other Middle East/Central Asia  ...   0.0   
 4         0          0                       Australia  ...   0.0   
 
   start_time_DI  last_event_DI nevents ndays_act  nplay_video  nchapters  \
 0    2012-07-24     2013-07-27      

In [5]:
# Dropping irrelevant or redundant columns
# Keeping columns related to the features of interest and dropping unnecessary ones
data_cleaned = data.drop(
    columns=["Unnamed: 0", "start_time_DI", "last_event_DI", "userid_DI", "incomplete_flag"]
)

# Handling missing values
# For categorical data, replace missing values with a placeholder or mode
data_cleaned["gender"].fillna("Unknown", inplace=True)

# Encoding categorical variables
categorical_columns = ["institute", "semester", "final_cc_cname_DI", "LoE_DI", "gender"]
data_encoded = pd.get_dummies(data_cleaned, columns=categorical_columns, drop_first=True)

# Display basic stats and a sample of the cleaned dataset
data_encoded.describe(), data_encoded.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned["gender"].fillna("Unknown", inplace=True)


(             year        viewed     explored     certified         grade  \
 count  10000.0000  10000.000000  10000.00000  10000.000000  10000.000000   
 mean    2012.3054      0.634500      0.12700      0.039300      0.045015   
 std        0.4606      0.481594      0.33299      0.194317      0.186466   
 min     2012.0000      0.000000      0.00000      0.000000      0.000000   
 25%     2012.0000      0.000000      0.00000      0.000000      0.000000   
 50%     2012.0000      1.000000      0.00000      0.000000      0.000000   
 75%     2013.0000      1.000000      0.00000      0.000000      0.000000   
 max     2013.0000      1.000000      1.00000      1.000000      1.000000   
 
             nevents     ndays_act   nplay_video     nchapters  nforum_posts  \
 count  10000.000000  10000.000000   10000.00000  10000.000000  10000.000000   
 mean     274.662400      5.359300  157314.20570      2.901700      0.019400   
 std     1151.748456     10.133601   79720.67733      4.607728   

In [6]:
# Group data by users and aggregate the courses they enrolled in
user_courses = data_cleaned.groupby("userid_DI")["course_id"].apply(list).reset_index()

# Display a sample of the aggregated data
user_courses.head()

KeyError: 'userid_DI'