In [1]:
import pandas as pd

df = pd.read_csv("E:/Projects/Drug-Abuse-Risk-Detection/data/raw/combined.csv")

print("Shape:", df.shape)
print("\nColumns:")
print(df.columns)

print("\nData Types:")
print(df.dtypes)

print("\nMissing Values:")
print(df.isnull().sum())

print("\nDuplicate Rows:", df.duplicated().sum())

print("\nBasic Statistics:")
print(df.describe(include='all'))


Shape: (63086, 12)

Columns:
Index(['S_no', 'Experimentation', 'Academic_Performance_Decline',
       'Social_Isolation', 'Financial_Issues',
       'Physical_Mental_Health_Problems', 'Legal_Consequences',
       'Relationship_Strain', 'Risk_Taking_Behavior', 'Withdrawal_Symptoms',
       'Denial_and_Resistance_to_Treatment', 'Addiction_Class'],
      dtype='object')

Data Types:
S_no                                   int64
Experimentation                       object
Academic_Performance_Decline          object
Social_Isolation                      object
Financial_Issues                      object
Physical_Mental_Health_Problems       object
Legal_Consequences                    object
Relationship_Strain                   object
Risk_Taking_Behavior                  object
Withdrawal_Symptoms                   object
Denial_and_Resistance_to_Treatment    object
Addiction_Class                       object
dtype: object

Missing Values:
S_no                                     0
Exp

In [2]:
#DROPPING USELESS COLUMN S_NO
df = df.drop(columns=["S_no"])
print("Column Dropped")

Column Dropped


In [3]:
#FINDING TARGET VARIABLE RESULTS
print(df["Addiction_Class"].value_counts())
print("\nPercentage Distribution")
print(df["Addiction_Class"].value_counts(normalize=True) * 100)

Addiction_Class
No     45375
Yes    17711
Name: count, dtype: int64

Percentage Distribution
Addiction_Class
No     71.925625
Yes    28.074375
Name: proportion, dtype: float64


In [4]:
#REPLACING CATEGORICAL TO BINARY VALUES 
df = df.replace({"Yes" : 1, "No": 0})
print(df.head())
print(type(df))
print(df.dtypes)

   Experimentation  Academic_Performance_Decline  Social_Isolation  \
0              1.0                           0.0               0.0   
1              0.0                           1.0               NaN   
2              0.0                           0.0               0.0   
3              1.0                           0.0               1.0   
4              1.0                           1.0               0.0   

   Financial_Issues  Physical_Mental_Health_Problems  Legal_Consequences  \
0               1.0                              0.0                 0.0   
1               1.0                              1.0                 1.0   
2               0.0                              0.0                 1.0   
3               1.0                              0.0                 1.0   
4               NaN                              0.0                 1.0   

   Relationship_Strain  Risk_Taking_Behavior  Withdrawal_Symptoms  \
0                  0.0                   1.0         

  df = df.replace({"Yes" : 1, "No": 0})


In [5]:
#CHECKING THE NUMBER OF MISSING VALUES
print("Total Missing Values per Column:\n")
print(df.isnull().sum())

print("\nTotal Missing Values in Dataset:", df.isnull().sum().sum())

Total Missing Values per Column:

Experimentation                       3207
Academic_Performance_Decline          3183
Social_Isolation                      3227
Financial_Issues                      3157
Physical_Mental_Health_Problems       3119
Legal_Consequences                    3262
Relationship_Strain                   3175
Risk_Taking_Behavior                  3190
Withdrawal_Symptoms                   3265
Denial_and_Resistance_to_Treatment    3107
Addiction_Class                          0
dtype: int64

Total Missing Values in Dataset: 31892


In [6]:
#CHECKING DATA CONSISTENCY (CHECKING UNIQUE VALUES AND CHECKING WHETHER DATA IS 0 AND 1 ONLY OR NOT)
for col in df.columns:
    print(col, df[col].unique())


Experimentation [ 1.  0. nan]
Academic_Performance_Decline [ 0.  1. nan]
Social_Isolation [ 0. nan  1.]
Financial_Issues [ 1.  0. nan]
Physical_Mental_Health_Problems [ 0.  1. nan]
Legal_Consequences [ 0.  1. nan]
Relationship_Strain [ 0.  1. nan]
Risk_Taking_Behavior [ 1.  0. nan]
Withdrawal_Symptoms [ 0.  1. nan]
Denial_and_Resistance_to_Treatment [ 0.  1. nan]
Addiction_Class [0 1]


In [7]:
#CHECKING DUPLICATE ROWS
print("Duplicate Rows:", df.duplicated().sum())

Duplicate Rows: 47968


In [8]:
#DROPPING DUPLICATE VALUES
df = df.drop_duplicates()
print("New Shape:", df.shape)

New Shape: (15118, 11)


In [9]:
#CHECKING MISSING DATA AFTER DROPPING DUPLICATE ROWS
print(df.isnull().sum())
print("Total Missing:", df.isnull().sum().sum())

Experimentation                       1893
Academic_Performance_Decline          1874
Social_Isolation                      1876
Financial_Issues                      1863
Physical_Mental_Health_Problems       1857
Legal_Consequences                    1957
Relationship_Strain                   1897
Risk_Taking_Behavior                  1898
Withdrawal_Symptoms                   1895
Denial_and_Resistance_to_Treatment    1855
Addiction_Class                          0
dtype: int64
Total Missing: 18865


In [10]:
for col in df.columns:
    if col != "Addiction_Class":
        df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [11]:
print(df.isnull().sum())

Experimentation                       0
Academic_Performance_Decline          0
Social_Isolation                      0
Financial_Issues                      0
Physical_Mental_Health_Problems       0
Legal_Consequences                    0
Relationship_Strain                   0
Risk_Taking_Behavior                  0
Withdrawal_Symptoms                   0
Denial_and_Resistance_to_Treatment    0
Addiction_Class                       0
dtype: int64


In [12]:
df = df.astype(int)

In [13]:
print(df.dtypes)

Experimentation                       int64
Academic_Performance_Decline          int64
Social_Isolation                      int64
Financial_Issues                      int64
Physical_Mental_Health_Problems       int64
Legal_Consequences                    int64
Relationship_Strain                   int64
Risk_Taking_Behavior                  int64
Withdrawal_Symptoms                   int64
Denial_and_Resistance_to_Treatment    int64
Addiction_Class                       int64
dtype: object


In [14]:
df.to_csv("E:/Projects/Drug-Abuse-Risk-Detection/data/processed/processed_dataset.csv", index=False)

print("Processed dataset saved successfully.")


Processed dataset saved successfully.
