1. Handling Missing Data in Titanic Dataset
   - Task: Identify and handle missing values in the Titanic dataset. Experiment with different strategies such as mean/median imputation, mode imputation, and dropping rows/columns.
   - Dataset: Titanic Dataset


In [1]:
import pandas as pd

# Load the Titanic passenger table and demonstrate three ways of handling
# missing values: mean/median imputation, mode imputation, and dropping a
# column outright.
titanic = pd.read_csv('titanic.csv')

print("Initial dataset:")
print(titanic.head())

print("Missing values in each column:")
print(titanic.isnull().sum())

# Assign the filled column back instead of calling
# `titanic[col].fillna(..., inplace=True)`. The in-place form mutates an
# intermediate object obtained by column selection: pandas 2.x emits a
# FutureWarning for it and, with Copy-on-Write enabled (the default from
# pandas 3.0), it leaves `titanic` unchanged.

# Numeric columns: mean imputation for Age, median imputation for Fare.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())

# Categorical column: fill with the most frequent value (first mode).
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

# Column-dropping strategy: remove Cabin entirely rather than imputing it.
titanic.drop(columns=['Cabin'], inplace=True)

print("Dataset after handling missing values:")
print(titanic.head())

print("Missing values in each column after handling:")
print(titanic.isnull().sum())


Initial dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

2. Encoding Categorical Variables in a Car Evaluation Dataset
   - Task: Encode categorical variables in the Car Evaluation dataset using one-hot encoding and label encoding. Compare the results.
   - Dataset: Car Evaluation Dataset

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Compare label encoding and one-hot encoding on the Car Evaluation data.
data = pd.read_csv('car_evaluation.csv')

print(data.head())
print(data.columns)

# Keep an untouched copy of the categorical data. The loop below overwrites
# `data` with integer codes, and `pd.get_dummies` only expands non-numeric
# columns, so one-hot encoding the already label-encoded frame would be a
# no-op and the feature-count comparison at the end would be meaningless.
raw_data = data.copy()

# Label encoding: one fitted encoder per categorical column, kept in a dict
# so the integer codes can be mapped back with `inverse_transform` later.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

print("\nLabel Encoded Data:")
print(data.head())

# One-hot encoding: applied to the raw categories, not to the integer codes.
data_one_hot = pd.get_dummies(raw_data)

print("\nOne-Hot Encoded Data:")
print(data_one_hot.head())

print("\nNumber of features after Label Encoding:", data.shape[1])
print("Number of features after One-Hot Encoding:", data_one_hot.shape[1])


3. Scaling Features in the Wine Quality Dataset
   - Task: Apply normalization and standardization to the features in the Wine Quality dataset. Analyze how scaling affects the distribution of data.
   - Dataset: Wine Quality Dataset


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load the red-wine data; 'quality' is the target, all other columns are
# treated as features.
# NOTE(review): the UCI copy of this file is ';'-separated; this call assumes
# a comma-separated copy -- confirm against the actual CSV.
data = pd.read_csv('winequality-red.csv')

y = data['quality']
X = data.drop(columns='quality')

# Normalization (min-max scaling) and standardization (z-scoring), each
# fitted on the full feature matrix.
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
X_normalized = min_max_scaler.fit_transform(X)
X_standardized = standard_scaler.fit_transform(X)

# The scalers return plain arrays; wrap them so columns keep their names.
X_normalized_df = pd.DataFrame(X_normalized, columns=X.columns)
X_standardized_df = pd.DataFrame(X_standardized, columns=X.columns)

# One plot row per variant: (frame to draw, title prefix, histogram colour).
panels = (
    (X, 'Original', 'blue'),
    (X_normalized_df, 'Normalized', 'green'),
    (X_standardized_df, 'Standardized', 'red'),
)

fig, axs = plt.subplots(len(panels), len(X.columns), figsize=(20, 15))

# Column by column, draw the same feature under each scaling so the three
# distributions line up vertically.
for col_idx, column in enumerate(X.columns):
    for row_idx, (frame, prefix, colour) in enumerate(panels):
        ax = axs[row_idx, col_idx]
        ax.hist(frame[column], bins=20, color=colour, alpha=0.7)
        ax.set_title(f'{prefix} {column}')

plt.tight_layout()
plt.show()



4. Handling Outliers in the Boston Housing Dataset
   - Task: Identify and handle outliers in the Boston Housing dataset using techniques like Z-score, IQR, and visualization methods.
   - Dataset: Boston Housing Dataset


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

# `load_boston` was removed in scikit-learn 1.2. Use it when the installed
# version still provides it; otherwise rebuild the same table from the
# original StatLib file, following the replacement snippet given in
# scikit-learn's removal notice (this path needs network access).
try:
    from sklearn.datasets import load_boston
except ImportError:
    feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                     'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
    raw_df = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',
                         sep=r'\s+', skiprows=22, header=None)
    # Each record spans two physical lines in the source file: the even rows
    # hold the first 11 features, the odd rows the last 2 plus the target.
    features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    data = pd.DataFrame(features, columns=feature_names)
    data['MEDV'] = raw_df.values[1::2, 2]
else:
    boston = load_boston()
    data = pd.DataFrame(boston.data, columns=boston.feature_names)
    data['MEDV'] = boston.target

# Z-score method: a row is an outlier if any column lies more than three
# standard deviations from that column's mean.
z_scores = np.abs(zscore(data))
outliers_z = data[(z_scores > 3).any(axis=1)]

# IQR method: a row is an outlier if any column falls outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. The mask is computed once and reused for
# both selecting and removing the outliers.
# NOTE: a column whose IQR is 0 flags every value different from its
# quartiles, so near-constant columns inflate the IQR outlier count.
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
iqr_outlier_mask = ((data < lower_bound) | (data > upper_bound)).any(axis=1)
outliers_iqr = data[iqr_outlier_mask]

# Visual check: one boxplot per column on a two-row grid. The column count is
# derived from the data (ceiling division) instead of being hard-coded.
n_rows = 2
n_cols = -(-len(data.columns) // n_rows)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 10), squeeze=False)
fig.suptitle('Boxplots for each feature')

flat_axes = axs.ravel()
for ax, column in zip(flat_axes, data.columns):
    sns.boxplot(y=data[column], ax=ax)

# Hide any grid cell left over when the column count is odd.
for ax in flat_axes[len(data.columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

data_no_outliers = data[~iqr_outlier_mask]

print(f'Number of outliers detected by Z-score method: {outliers_z.shape[0]}')
print(f'Number of outliers detected by IQR method: {outliers_iqr.shape[0]}')
print(f'Number of data points after removing outliers using IQR method: {data_no_outliers.shape[0]}')


5. Data Imputation in the Retail Sales Dataset
   - Task: Handle missing values in the Retail Sales dataset using advanced imputation techniques like KNN imputation and MICE.
   - Dataset: Retail Sales Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from fancyimpute import IterativeImputer

# Fill missing values in the retail sales data with two model-based
# imputers: K-nearest-neighbours and MICE (iterative imputation).
data = pd.read_csv('retail_sales.csv')

print("Initial dataset with missing values:")
print(data.head())

# Both imputers accept numeric input only, so passing the whole frame raises
# as soon as it contains a date or category column. Impute the numeric
# columns and carry every other column through unchanged. Columns with no
# observed value at all are skipped as well: the imputers drop such features,
# which would break the column alignment of the result.
numeric_columns = (
    data.select_dtypes(include='number').dropna(axis=1, how='all').columns
)

# KNN imputation: each gap is filled from the 5 most similar rows.
knn_imputer = KNNImputer(n_neighbors=5)
data_knn_imputed = data.copy()
data_knn_imputed[numeric_columns] = knn_imputer.fit_transform(data[numeric_columns])

print("\nDataset after KNN imputation:")
print(data_knn_imputed.head())

# MICE: every feature with gaps is modelled from the others, for at most 10
# rounds; the seed is fixed so the result is reproducible.
mice_imputer = IterativeImputer(max_iter=10, random_state=42)
data_mice_imputed = data.copy()
data_mice_imputed[numeric_columns] = mice_imputer.fit_transform(data[numeric_columns])

print("\nDataset after MICE imputation:")
print(data_mice_imputed.head())


6. Feature Engineering in the Heart Disease Dataset
   - Task: Create new features from existing ones in the Heart Disease dataset, such as age groups, cholesterol levels, and more.
   - Dataset: Heart Disease Dataset


In [None]:
import pandas as pd
# Derive categorical features (age group, cholesterol level, blood-pressure
# category, max-heart-rate category) from numeric heart-disease columns.
data = pd.read_csv('heart_disease.csv')

print("Initial dataset:")
print(data.head())

# Every binning below uses right=False, i.e. half-open intervals [low, high).
# The edges are therefore the first value of the NEXT category; the previous
# edges (29, 39, ..., 239, 129, 139) were off by one against their labels,
# e.g. age 29 landed in '30-39'. Open-ended top categories use an infinite
# upper edge so unusually large readings are categorised instead of
# silently becoming NaN.
NO_UPPER_LIMIT = float('inf')

# Age in decades. Ages below 20 match none of the labels and stay NaN.
age_bins = [20, 30, 40, 50, 60, 70, 80, 90, 101]
age_labels = ['20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-100']
data['age_group'] = pd.cut(data['age'], bins=age_bins, labels=age_labels, right=False)

# Cholesterol: < 200 normal, 200-239 borderline high, >= 240 high.
# The other columns use the abbreviated UCI names ('trestbps', 'thalach'),
# so fall back to 'chol' when no 'cholesterol' column is present.
cholesterol_column = 'cholesterol' if 'cholesterol' in data.columns else 'chol'
cholesterol_bins = [0, 200, 240, NO_UPPER_LIMIT]
cholesterol_labels = ['Normal', 'Borderline High', 'High']
data['cholesterol_level'] = pd.cut(
    data[cholesterol_column], bins=cholesterol_bins, labels=cholesterol_labels, right=False
)

# Resting blood pressure: < 120, 120-129, 130-139, 140-179, >= 180.
# NOTE(review): thresholds presumably follow the AHA systolic categories --
# confirm against the guideline intended for this exercise.
blood_pressure_bins = [0, 120, 130, 140, 180, NO_UPPER_LIMIT]
blood_pressure_labels = [
    'Normal',
    'Elevated',
    'High Blood Pressure (Hypertension Stage 1)',
    'High Blood Pressure (Hypertension Stage 2)',
    'Hypertensive Crisis',
]
data['blood_pressure_category'] = pd.cut(
    data['trestbps'], bins=blood_pressure_bins, labels=blood_pressure_labels, right=False
)

# Maximum heart rate achieved: < 100, 100-139, 140-159, >= 160.
heart_rate_bins = [0, 100, 140, 160, NO_UPPER_LIMIT]
heart_rate_labels = ['Low', 'Below Average', 'Above Average', 'High']
data['max_heart_rate_category'] = pd.cut(
    data['thalach'], bins=heart_rate_bins, labels=heart_rate_labels, right=False
)

print("\nDataset after feature engineering:")
print(data.head())


7. Transforming Variables in the Bike Sharing Dataset
   - Task: Apply transformations like log, square root, and Box-Cox transformations to skewed variables in the Bike Sharing dataset.
   - Dataset: Bike Sharing Dataset


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox
import matplotlib.pyplot as plt
import seaborn as sns

# Apply log, square-root and Box-Cox transformations to the skewed 'count'
# column and plot every skewed feature next to its transformed versions.
data = pd.read_csv('bike_sharing.csv')

print("Initial dataset:")
print(data.head())

skewed_features = ['count', 'temp', 'atemp', 'humidity', 'windspeed']

# log1p (log(1 + x)) stays defined when the count is 0.
data['count_log'] = np.log1p(data['count'])

data['count_sqrt'] = np.sqrt(data['count'])

# Box-Cox needs strictly positive input, hence the +1 shift.
data['count_boxcox'], fitted_lambda = boxcox(data['count'] + 1)

# (column suffix, title text) for each transformation, in plot order.
transformations = [
    ('log', 'Log Transformed'),
    ('sqrt', 'Square Root Transformed'),
    ('boxcox', 'Box-Cox Transformed'),
]

# One row per feature, one column for the original plus one per
# transformation. The grid is sized from the lists: the previous fixed 4x2
# grid raised IndexError on the fifth feature, and its
# `feature == 'count_sqrt'` / `'count_boxcox'` branches could never match a
# name from `skewed_features`, so those two plots were never drawn.
n_rows = len(skewed_features)
n_cols = 1 + len(transformations)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)

for i, feature in enumerate(skewed_features):
    sns.histplot(data[feature], bins=30, ax=axes[i, 0], kde=True)
    axes[i, 0].set_title(f'Distribution of {feature} (Original)')

    for j, (suffix, description) in enumerate(transformations, start=1):
        transformed_column = f'{feature}_{suffix}'
        if transformed_column in data.columns:
            sns.histplot(data[transformed_column], bins=30, ax=axes[i, j], kde=True)
            axes[i, j].set_title(f'Distribution of {feature} ({description})')
        else:
            # No transformed version was computed for this feature.
            axes[i, j].axis('off')

plt.tight_layout()
plt.show()

print("\nDataset after transformations:")
print(data.head())


8. Feature Selection in the Diabetes Dataset
   - Task: Use techniques like correlation analysis, mutual information, and recursive feature elimination (RFE) to select important features in the Diabetes dataset.
   - Dataset: Diabetes Dataset


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt

# Compare three feature-selection techniques on the diabetes data:
# correlation with the target, mutual information, and recursive feature
# elimination (RFE).
data = pd.read_csv('diabetes.csv')

print("Initial dataset:")
print(data.head())

X = data.drop(columns='Outcome')
y = data['Outcome']

# Number of features kept by the ranking-based methods (MI and RFE).
n_features_to_select = 5

# --- Correlation analysis -------------------------------------------------
correlation_matrix = data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()

threshold = 0.2
# Drop the target's correlation with itself (always 1.0); otherwise
# 'Outcome' would be reported as one of its own predictors.
target_correlation = correlation_matrix['Outcome'].drop('Outcome')
corr_features = target_correlation.index[target_correlation.abs() > threshold].tolist()
print(f'Features with correlation greater than {threshold}: {corr_features}')

# --- Mutual information ---------------------------------------------------
# The estimator is randomised; fix the seed so the ranking is reproducible.
mutual_info = mutual_info_classif(X, y, random_state=42)
mutual_info_series = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
mutual_info_series.plot(kind='bar')
plt.title('Mutual Information Scores')
plt.show()
# Keep only the highest-scoring features; listing the whole sorted index
# would "select" every feature.
mutual_info_features = mutual_info_series.index[:n_features_to_select].tolist()
print(f'Top features based on mutual information: {mutual_info_features}')

# --- Recursive feature elimination ----------------------------------------
# RFE ranks features by the size of the fitted coefficients, which depends on
# each feature's scale. Standardise first (zero mean, unit variance) so the
# ranking reflects importance rather than units. Constant columns keep a
# divisor of 1 to avoid dividing by zero.
feature_std = X.std(ddof=0).replace(0, 1)
X_scaled = (X - X.mean()) / feature_std

model = LogisticRegression(max_iter=10000)
rfe = RFE(model, n_features_to_select=n_features_to_select)
fit = rfe.fit(X_scaled, y)
rfe_features = X.columns[fit.support_].tolist()
print(f'Selected features using RFE: {rfe_features}')

selected_features = {
    'Correlation': corr_features,
    'Mutual Information': mutual_info_features,
    'RFE': rfe_features
}
print("\nSelected features from each method:")
for method, features in selected_features.items():
    print(f'{method}: {features}')

9. Dealing with Imbalanced Data in the Credit Card Fraud Detection Dataset
   - Task: Handle imbalanced data in the Credit Card Fraud Detection dataset using techniques like SMOTE, ADASYN, and undersampling.
   - Dataset: Credit Card Fraud Detection Dataset


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Read the credit-card transactions and preview the first rows.
data = pd.read_csv('creditcard.csv')

print("Initial dataset:")
print(data.head())

# 'Class' is the label column; every remaining column is used as a feature.
y = data['Class']
X = data.drop(columns='Class')

# Hold out 20% for testing. Stratifying on the label keeps the class ratio
# the same in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

def evaluate_model(X_train, y_train, X_test, y_test):
    """Fit a logistic regression on the training data and report on the test data.

    Prints the classification report followed by the confusion matrix for the
    predictions made on ``X_test``. Nothing is returned.
    """
    classifier = LogisticRegression(max_iter=10000).fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    for metric in (classification_report, confusion_matrix):
        print(metric(y_test, predictions))

# Baseline: train on the imbalanced training set as it is.
print("Original dataset evaluation:")
evaluate_model(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

# The three resampling strategies under comparison, all with the same seed.
# Only the training partition is ever resampled; every model is scored on
# the same untouched test partition.
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
undersample = RandomUnderSampler(random_state=42)

# Oversampling with SMOTE.
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("SMOTE dataset evaluation:")
evaluate_model(X_train=X_train_smote, y_train=y_train_smote, X_test=X_test, y_test=y_test)

# Oversampling with ADASYN.
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)
print("ADASYN dataset evaluation:")
evaluate_model(X_train=X_train_adasyn, y_train=y_train_adasyn, X_test=X_test, y_test=y_test)

# Random undersampling.
X_train_undersample, y_train_undersample = undersample.fit_resample(X_train, y_train)
print("Undersampled dataset evaluation:")
evaluate_model(X_train=X_train_undersample, y_train=y_train_undersample, X_test=X_test, y_test=y_test)


10. Combining Multiple Datasets in the MovieLens Dataset
    - Task: Combine and preprocess multiple related datasets from the MovieLens dataset, such as ratings, user information, and movie metadata.
    - Dataset: MovieLens Dataset

In [None]:
import pandas as pd

# Combine the three MovieLens tables (ratings, users, movies) into a single
# frame, clean it up, and write the result to disk.
ratings = pd.read_csv('ratings.csv')
users = pd.read_csv('users.csv')
movies = pd.read_csv('movies.csv')

print("Ratings dataset:")
print(ratings.head())
print("Users dataset:")
print(users.head())
print("Movies dataset:")
print(movies.head())

# Inner joins (the pd.merge default): a rating is kept only if both its user
# and its movie are present in the other tables.
ratings_users = pd.merge(ratings, users, on='userId')
complete_data = pd.merge(ratings_users, movies, on='movieId')

print("Combined dataset:")
print(complete_data.head())

# Fill numeric gaps with the column mean. `numeric_only=True` is required:
# the merged frame contains text columns, and from pandas 2.0 a plain
# `DataFrame.mean()` raises TypeError on them instead of skipping them.
complete_data.fillna(complete_data.mean(numeric_only=True), inplace=True)

# Replace the categorical text columns with integer category codes
# (missing values become -1).
complete_data['gender'] = complete_data['gender'].astype('category').cat.codes
complete_data['occupation'] = complete_data['occupation'].astype('category').cat.codes

# Timestamps are parsed as seconds since the Unix epoch.
complete_data['timestamp'] = pd.to_datetime(complete_data['timestamp'], unit='s')

print("Preprocessed dataset:")
print(complete_data.head())

complete_data.to_csv('combined_movie_lens_data.csv', index=False)
