In [2]:
!pip install psycopg2-binary
!pip install pymysql



In [3]:
import psycopg2
print("psycopg2 installed successfully!")

psycopg2 installed successfully!


In [4]:
import pymysql
import pandas as pd

def fetch_data():
    """ Connects to MySQL, fetches data, and returns a Pandas DataFrame. """
    try:
        conn = pymysql.connect(
            host='eufmd-database-1.cqodkl4vazie.eu-north-1.rds.amazonaws.com',
            user='root',
            password='YOUR_PASSWORD',
            database='db_training',
            port=3306,
            cursorclass=pymysql.cursors.DictCursor
        )
        print("✅ Connected to database successfully!")

        # Execute query
        with conn.cursor() as cursor:
            query = "SELECT * FROM moodle_enrols;"
            cursor.execute(query)
            columns = [desc[0] for desc in cursor.description]  # Get column names
            rows = cursor.fetchall()  # Fetch all rows

        # Convert to DataFrame
        df = pd.DataFrame(rows, columns=columns)
        return df

    except pymysql.Error as e:
        print(f"❌ Error: {e}")
        return None

    finally:
        if conn:
            conn.close()
            print("✅ Database connection closed.")

# Run the script
if __name__ == "__main__":
    df = fetch_data()
    if df is not None:
        print(df.head())  # Print first 5 rows


✅ Connected to database successfully!
✅ Database connection closed.
   id  user_id  course_id  enrol_date completion_date  progress status
0   1       19         14  2013-08-12            None        50   open
1   2      155         14  2013-10-18            None        50   open
2   3       56         14  2013-08-12            None        50   open
3   4      156          3  2013-10-18            None        50   open
4   5       35         14  2013-08-12            None        50   open


In [5]:
df

Unnamed: 0,id,user_id,course_id,enrol_date,completion_date,progress,status
0,1,19,14,2013-08-12,,50,open
1,2,155,14,2013-10-18,,50,open
2,3,56,14,2013-08-12,,50,open
3,4,156,3,2013-10-18,,50,open
4,5,35,14,2013-08-12,,50,open
...,...,...,...,...,...,...,...
128128,141132,5099,466,2025-01-30,0000-00-00,50,open
128129,141133,12197,466,2025-01-30,0000-00-00,50,open
128130,141135,4599,466,2025-01-30,0000-00-00,50,open
128131,141137,11020,466,2025-01-30,0000-00-00,50,open


In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128133 entries, 0 to 128132
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               128133 non-null  int64 
 1   user_id          128133 non-null  int64 
 2   course_id        128133 non-null  int64 
 3   enrol_date       128133 non-null  object
 4   completion_date  34651 non-null   object
 5   progress         128133 non-null  int64 
 6   status           128133 non-null  object
dtypes: int64(4), object(3)
memory usage: 6.8+ MB
None


In [7]:
print(f"duplicates:{df.duplicated().sum()}")

duplicates:0


In [8]:
print(df["course_id"].unique()[:10])

[14  3 20 21  4  2 28 12 13 16]


In [9]:
print(df.describe())

                  id        user_id      course_id       progress
count  128133.000000  128133.000000  128133.000000  128133.000000
mean    65936.148572   17440.345118     194.122607      60.712697
std     39771.725504   18222.115558     172.025990      20.515269
min         1.000000       2.000000       2.000000      50.000000
25%     32035.000000    4241.000000      14.000000      50.000000
50%     64068.000000   10750.000000     164.000000      50.000000
75%     96101.000000   22434.000000     379.000000      50.000000
max    141138.000000   67680.000000     466.000000     100.000000


In [10]:
print(df["progress"].value_counts())

progress
50     100680
100     27453
Name: count, dtype: int64


In [11]:
print(df["completion_date"].value_counts())

completion_date
0000-00-00    7198
2024-07-31     607
2024-10-11     479
2024-12-20     398
2024-11-07     388
              ... 
2019-08-04       1
2019-07-17       1
2019-07-03       1
2022-11-26       1
2022-01-26       1
Name: count, Length: 2315, dtype: int64


## Data Preprocessing

In [12]:
print(df.isnull().sum())  # Count missing values in each column



id                     0
user_id                0
course_id              0
enrol_date             0
completion_date    93482
progress               0
status                 0
dtype: int64


In [13]:
# Fill in missing values with "missing completion date"
df["completion_date"].fillna("unknown completion date", inplace=True)

In [14]:
print(df["completion_date"].value_counts())

completion_date
unknown completion date    93482
0000-00-00                  7198
2024-07-31                   607
2024-10-11                   479
2024-12-20                   398
                           ...  
2022-11-25                     1
2018-09-28                     1
2018-09-13                     1
2019-06-03                     1
2022-01-26                     1
Name: count, Length: 2316, dtype: int64


In [15]:
print(df.isnull().sum())

id                 0
user_id            0
course_id          0
enrol_date         0
completion_date    0
progress           0
status             0
dtype: int64


In [16]:
df['completion_date'] = pd.to_datetime(df['completion_date'], errors='coerce')  # Convert to datetime

# Filter valid dates (non-null after conversion)
df_valid_dates = df[df['completion_date'].notna()]


  df['completion_date'] = pd.to_datetime(df['completion_date'], errors='coerce')  # Convert to datetime


In [17]:
df['completion_date'] = pd.to_datetime(df['completion_date'], format='%Y-%m-%d', errors='coerce')


In [18]:
df_valid_dates = df[~df['completion_date'].isin(['None', '00', '0000-00-00', ''])]  # Exclude invalid values


