<a href="https://colab.research.google.com/github/Amaan-del/Bharat-intern-Task-1/blob/main/celebtask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload files from the local machine into the notebook runtime using the
# google.colab helper module; the return value of files.upload() is kept in
# `uploaded`.
# NOTE(review): presumably this is how "sensor_data.csv" (read by a later cell)
# reaches the working directory — confirm. The google.colab import is expected
# to be available only when running inside Google Colab.
from google.colab import files
uploaded = files.upload()


Data Exploration and Preprocessing (20%)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv("sensor_data.csv")

# Basic info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Dataset Info:")
df.info()

# Check for missing values (count of nulls per column)
print("\nMissing Values:\n", df.isnull().sum())

# Summary statistics of the numeric columns
print("\nDescriptive Statistics:\n", df.describe())


In [None]:
# Handling missing values: fill gaps in numeric columns with the column mean.
# numeric_only=True keeps this working when the frame also holds non-numeric
# columns (pandas >= 2.0 raises TypeError on a plain df.mean() in that case).
df = df.fillna(df.mean(numeric_only=True))

# Outlier removal using Z-score
from scipy.stats import zscore

# Score only the numeric *feature* columns. The "anomaly" label must be left
# out: for a rare positive class its z-score exceeds 3, so every positive row
# would be thrown away as an "outlier".
feature_cols = df.select_dtypes(include=np.number).columns.drop("anomaly", errors="ignore")
z_scores = np.abs(zscore(df[feature_cols].to_numpy(dtype=float)))

# A zero-variance column yields NaN z-scores; NaN < 3 is False and would drop
# every row, so NaN is treated as "not an outlier".
keep_rows = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)

# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[keep_rows].copy()


In [None]:
# Feature Engineering
# Row-wise mean of the numeric sensor columns. Columns are selected by name
# instead of by position (df.iloc[:, :-1]) so that neither the "anomaly" label
# nor a previously computed 'sensor_mean' ends up in the average; this makes
# the cell safe to re-run and prevents label leakage into the feature.
sensor_cols = (
    df.select_dtypes(include=np.number)
    .columns.drop(["anomaly", "sensor_mean"], errors="ignore")
)
# assign() returns a new frame, avoiding SettingWithCopyWarning on a filtered df.
df = df.assign(sensor_mean=df[sensor_cols].mean(axis=1))

# Correlation Analysis
# numeric_only=True: DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()


Classical and Advanced Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the label from the predictors (label column assumed to be "anomaly")
y = df["anomaly"]
X = df.drop(columns="anomaly")

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class balance, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler learns its statistics from the training
# split only and the same transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Baseline classifiers, all with default hyper-parameters. Estimators that
# accept a seed get the same one used for the train/test split (42) so the
# reported scores are identical on every Restart & Run All; an unseeded
# decision tree can produce a different tree, and F1, on each run.
classical_models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# Fit each model on the scaled training split and report the F1 score of the
# positive class on the held-out test split.
print("Classical Model F1 Scores:")
for name, model in classical_models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = f1_score(y_test, preds)
    print(f"{name}: {score:.4f}")


In [None]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Tree-ensemble / gradient-boosting classifiers with default hyper-parameters.
# Every model is seeded (same seed as the train/test split) so the comparison
# is reproducible; an unseeded random forest gives different scores per run.
advanced_models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_seed=42)
}

# Fit each model on the scaled training split and report the F1 score of the
# positive class on the held-out test split.
print("\nAdvanced Model F1 Scores:")
for name, model in advanced_models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = f1_score(y_test, preds)
    print(f"{name}: {score:.4f}")


Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, RocCurveDisplay

# The "best" model is picked by hand from the F1 comparison; XGBoost is assumed
# here, and a fresh instance is trained on the scaled training split.
final_model = xgb.XGBClassifier(eval_metric='logloss')
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# Per-class precision / recall / F1, followed by the raw confusion matrix,
# both computed on the held-out test split.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)
matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", matrix)

# ROC curve of the fitted model on the test split; the title is set on the
# axes that the display object created.
roc_display = RocCurveDisplay.from_estimator(final_model, X_test, y_test)
roc_display.ax_.set_title("ROC Curve - XGBoost")
plt.show()
