In [None]:
# AnomaData Project: Exploratory Data Analysis

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('AnomaData.csv')

# Check for missing values (number of nulls per column)
print(df.isnull().sum())

# Visualize the distribution of each feature.
# Features are picked by dtype and name instead of by position: 'y' is not the
# last column of this file, so dropping the last column would still plot the
# target and would also try to bin the 'time' column.
# NOTE(review): 'y.1' is numeric and is therefore plotted as a feature here;
# confirm what that column represents.
feature_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in ('time', 'y')
]
for col in feature_cols:
    plt.figure(figsize=(8, 6))
    sns.histplot(df[col], bins='auto')
    plt.title(f'Distribution of {col}')
    plt.xticks(rotation=45)
    plt.show()

# Calculate summary statistics for each feature
print(df.describe())
df.info()  # info() writes its report itself and returns None, so it is not wrapped in print()
# Correlation is only defined for numeric columns; restricting the frame first
# avoids an error on non-numeric columns such as a string timestamp.
print(df.select_dtypes(include='number').corr())

# Visualize the target variable 'y'
plt.figure(figsize=(8, 6))
# Pass the series as x= explicitly: recent seaborn releases treat the first
# positional argument as `data`, which does not produce a per-class count.
sns.countplot(x=df['y'])
plt.title('Count of Anomalies and Non-Anomalies')
plt.show()

time    0
y       0
x1      0
x2      0
x3      0
       ..
x57     0
x58     0
x59     0
x60     0
y.1     0
Length: 62, dtype: int64


In [None]:
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

# Numeric feature columns named x<number>. Selecting them by pattern replaces
# the hand-written (and incomplete) column list and keeps the timestamp and
# the target 'y' out of scaling and outlier filtering.
feature_cols = df.filter(regex=r'^x\d+$').select_dtypes(include='number').columns.tolist()
assert feature_cols, "no numeric x<number> feature columns found"

# Handle missing values: fill numeric columns with their column mean.
# numeric_only=True keeps non-numeric columns from breaking the mean.
df = df.fillna(df.mean(numeric_only=True))

# Remove duplicates
df = df.drop_duplicates()

# Normalize data
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so test-set statistics leak into training; consider fitting it on the
# training split only.
scaler = MinMaxScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

# Encode categorical variables.
# get_dummies returns one column per category, so it must replace the source
# column rather than be assigned into it; skipped when the column is absent.
if 'x_cat' in df.columns:
    df = pd.get_dummies(df, columns=['x_cat'])

# Remove outliers: keep rows whose features all lie within 3 standard
# deviations. The target is deliberately excluded: for a rare positive class
# the label's own z-score exceeds 3, which would delete the anomalies.
z_scores = np.abs(stats.zscore(df[feature_cols].to_numpy(dtype=float), axis=0))
# A constant column has zero variance and yields NaN z-scores; such values are
# treated as "not an outlier" instead of discarding every row.
within_limits = np.isnan(z_scores) | (z_scores < 3)
df = df[within_limits.all(axis=1)].copy()

In [None]:
# Original numeric feature columns (x<number>), captured before derived
# columns are added so the row-wise statistics below only see raw features.
feature_cols = df.filter(regex=r'^x\d+$').select_dtypes(include='number').columns.tolist()

# Extract date components.
# The printed column listing for this dataset shows a 'time' column and no
# 'date' column, so the timestamp is parsed from 'time'.
TIME_COL = 'time'
df[TIME_COL] = pd.to_datetime(df[TIME_COL])
df['day'] = df[TIME_COL].dt.day
df['month'] = df[TIME_COL].dt.month
df['year'] = df[TIME_COL].dt.year

# Create interaction terms
df['x1_x2'] = df['x1'] * df['x2']

# Transform categorical variable.
# get_dummies yields one column per category, so it replaces the source column
# instead of being assigned into it; skipped when the column is absent (for
# example because it was already encoded).
if 'x_cat' in df.columns:
    df = pd.get_dummies(df, columns=['x_cat'])

# Calculate statistical moments per row across the raw features.
# A whole-column mean/std would broadcast a single scalar to every row and add
# a constant, uninformative feature.
# NOTE(review): assumes per-row moments were the intent -- confirm.
df['x_mean'] = df[feature_cols].mean(axis=1)
df['x_std'] = df[feature_cols].std(axis=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Define models: display name -> unfitted estimator.
# The same seed as the train/test split (42) is used so that repeated runs of
# the notebook give the same forest and therefore the same scores.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    # A higher iteration cap than the default of 100 gives the solver room to
    # converge on a wide feature matrix.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machine': SVC()
}

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Build the feature matrix and target.
# scikit-learn estimators need numeric input, so non-numeric columns (such as
# a raw timestamp) are left out of X; boolean dummy columns are kept.
# NOTE(review): every remaining numeric column is used as a feature, including
# 'y.1' if present -- confirm it is not a copy or shift of the target.
X = df.drop(columns=['y']).select_dtypes(include=['number', 'bool'])
y = df['y']

# Split data into training and testing sets.
# stratify=y keeps the class ratio equal in both splits, which matters when
# one class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train models and report hold-out accuracy.
# Accuracy alone can look high on an imbalanced target; read it together with
# per-class precision/recall.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name}: {accuracy_score(y_test, y_pred):.3f}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-model evaluation on the held-out split: a header line with the model
# name, then the classification report, then the confusion matrix.
for name, model in models.items():
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    # One print with a newline separator emits the same three lines of output
    # as three consecutive print() calls.
    print(f"{name}:", report, matrix, sep="\n")

In [None]:
import pickle

from sklearn.model_selection import GridSearchCV

# Define hyperparameter tuning space (keyed by the same names as `models`)
param_grid = {
    'Random Forest': {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]},
    # The default 'lbfgs' solver does not support the l1 penalty; 'liblinear'
    # supports both l1 and l2, so every combination in the grid can be fitted.
    'Logistic Regression': {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    'Support Vector Machine': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
}

# Perform grid search, keeping each refitted best estimator and its CV score
best_estimators = {}
best_scores = {}
for name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print(f"{name}: Best params = {grid_search.best_params_}, Best score = {grid_search.best_score_}")
    best_estimators[name] = grid_search.best_estimator_
    best_scores[name] = grid_search.best_score_

# Persist the best-scoring tuned model so the prediction service has a
# 'trained_model.pkl' file to load.
best_name = max(best_scores, key=best_scores.get)
with open('trained_model.pkl', 'wb') as model_file:
    pickle.dump(best_estimators[best_name], model_file)
print(f"Saved {best_name} to trained_model.pkl")

In [None]:
from flask import Flask, request, jsonify
import pickle

import pandas as pd

app = Flask(__name__)

# Load trained model.
# SECURITY: unpickling executes code embedded in the file, so only load a
# model file that this project produced itself.
with open('trained_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)


@app.route('/predict', methods=['POST'])
def predict():
    """Return model predictions for the JSON payload of a POST request.

    The body must be either one JSON object mapping feature names to values
    or a non-empty JSON array of such objects (one per row).

    Returns:
        200 with ``{"prediction": [...]}`` on success, or 400 with
        ``{"error": ...}`` when the payload is missing, malformed, lacks
        features the model was fitted with, or is rejected by the model.
    """
    # silent=True yields None for a missing or invalid JSON body instead of
    # raising, so all bad input goes through the same 400 response below.
    data = request.get_json(silent=True)
    if isinstance(data, dict):
        records = [data]
    elif isinstance(data, list) and data and all(isinstance(row, dict) for row in data):
        records = data
    else:
        return jsonify({'error': 'expected a JSON object or a non-empty list of JSON objects'}), 400

    # predict() needs a 2-D table, not a raw dict: one row per record.
    features = pd.DataFrame(records)

    # Estimators fitted on a DataFrame expose the training column names; when
    # available they are used to validate and order the incoming columns.
    expected = getattr(model, 'feature_names_in_', None)
    if expected is not None:
        missing = [str(col) for col in expected if col not in features.columns]
        if missing:
            return jsonify({'error': 'missing features', 'missing': missing}), 400
        features = features[list(expected)]

    try:
        prediction = model.predict(features)
    except (ValueError, TypeError) as exc:
        # Values the model cannot consume (e.g. non-numeric) are a client error.
        return jsonify({'error': str(exc)}), 400
    return jsonify({'prediction': prediction.tolist()})


if __name__ == '__main__':
    # Debug mode is off: it enables the interactive debugger, which allows
    # code execution from the browser, and the auto-reloader, which restarts
    # the process and does not work from inside a notebook kernel.
    app.run(debug=False)

In [None]:
curl -X POST -H "Content-Type: application/json" -d '{"x1": 1, "x2": 2}' http://localhost:5000/predict