In [50]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [51]:
# Load dataset
file_path = '/content/Time-Wasters on Social Media.csv'
dataframe = pd.read_csv(file_path)

# Drop the UserID column (not a feature)
dataframe = dataframe.drop(columns=['UserID'])

# Choose a target column (e.g., Addiction Level for demonstration)
target_column = 'Addiction Level'
X = dataframe.drop(columns=[target_column])  # Features
Y = dataframe[target_column]  # Target

# Convert non-numeric columns to numeric using LabelEncoder
label_cols = X.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder().fit(X[col].astype(str)) for col in label_cols}
for col, encoder in label_encoders.items():
    X[col] = encoder.transform(X[col].astype(str))

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Handle missing values in target, if any
Y = Y.fillna(Y.mode()[0])  # Replace NaN in the target with the most frequent value


In [52]:
# Create the pipeline
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('lda', LinearDiscriminantAnalysis()))
model = Pipeline(estimators)

In [53]:
# evaluate pipeline
kfold = KFold(n_splits=10,shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Mean Accuracy: {results.mean():.4f}")


Mean Accuracy: 1.0000


In [54]:
# Import libraries
import warnings
from pandas import read_csv
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [55]:
# Suppress specific runtime warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Load dataset
file_path = '/content/Time-Wasters on Social Media.csv'
dataframe = read_csv(file_path)

# Drop the UserID column (not a feature)
if 'UserID' in dataframe.columns:
    dataframe = dataframe.drop(columns=['UserID'])

# Define target and features
target_column = 'Addiction Level'
X = dataframe.drop(columns=[target_column])
Y = dataframe[target_column]

# Convert non-numeric columns to numeric using LabelEncoder
label_cols = X.select_dtypes(include=['object']).columns
for col in label_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Handle missing values in features using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Handle missing values in target
if Y.isnull().any():
    Y.fillna(Y.mode()[0], inplace=True)  # Replace NaN in the target with the mode

# Define FeatureUnion
feature_union = FeatureUnion([
    ('pca', PCA(n_components=3)),         # Principal Component Analysis
    ('select_best', SelectKBest(k=6))    # Statistical feature selection
])


In [56]:
# Define the full pipeline
estimators = [
    ('feature_union', feature_union),    # Feature extraction
    ('logistic', LogisticRegression(solver='liblinear'))  # Logistic regression model
]
pipeline = Pipeline(estimators)

# Evaluate the pipeline using 10-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(pipeline, X, Y, cv=kfold)

In [57]:
# Print the mean accuracy
print(f"Mean Accuracy: {results.mean():.4f}")

Mean Accuracy: 0.7110
