In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_iris

# Exercise 1: Load Dataset
# The loader's result exposes the measurements (.data), the column labels
# (.feature_names) and the class codes (.target), all of which are used below.
iris = load_iris()

# Build the DataFrame in one expression: assign() appends the class codes
# as a trailing 'target' column (species).
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Exercise 2: View Top Rows
print("First 5 rows of the dataset:", df.head(), sep="\n")

# Exercise 3: Check Shape
print("\nShape of the dataset (rows, columns):", df.shape, sep="\n")


First 5 rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  

Shape of the dataset (rows, columns):
(150, 5)


Explanation of chunksize and DataFrame
DataFrame in Pandas:

A DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns). It is one of the most important structures in the Pandas library for data analysis.

A DataFrame can hold data of different types (e.g., integers, floats, strings), and it is similar to a table or spreadsheet.

Common operations you can perform on a DataFrame include:

Indexing, slicing, and selecting subsets of data.

Summarizing data (mean, sum, count, etc.).

Handling missing values.

Grouping, merging, and reshaping data.

For example, in the code above, df is a DataFrame that contains the Iris dataset's features and the target variable.

chunksize in Pandas:

chunksize is an argument you can pass to functions like pandas.read_csv() or pandas.read_sql() when you're working with large datasets that might not fit into memory all at once.

When you specify a chunksize, Pandas reads the file in chunks (batches) of a specified number of rows. Instead of loading the entire file into memory, it loads the data in smaller, manageable pieces.

For example, if you're reading a CSV file with millions of rows, you might not want to load the entire file into memory at once. Instead, you can specify a chunksize to read and process the data in parts.

In [2]:
# Exercise 4: Print all column names
print("Column names:", df.columns, sep="\n")

# Exercise 5: Data Info
# info() writes its report directly to stdout, so it is called on its own
# line rather than being wrapped in print().
print("\nData info (data types, non-null counts, etc.):")
df.info()

# Exercise 6: Summary Statistics
print("\nSummary statistics of the numerical columns:", df.describe(), sep="\n")

# Exercise 7: Missing Value Check
# isnull() flags every missing cell; summing column-wise counts them.
print("\nMissing values in each column:", df.isnull().sum(), sep="\n")

# Exercise 8: Select the first 10 rows using iloc
# Positional slice: rows at positions 0 through 9.
print("\nFirst 10 rows using iloc:", df.iloc[0:10], sep="\n")


Column names:
Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

Data info (data types, non-null counts, etc.):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB

Summary statistics of the numerical columns:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min          

**Exercise 4: Print All Column Names**

To print the column names of the DataFrame, we can use the df.columns attribute. This returns a pandas Index object holding the names of all the columns in the DataFrame.

**Exercise 5: Data Info**

The df.info() method provides a concise summary of the DataFrame, including the data types of each column, the number of non-null values, and the memory usage. This is useful for getting an overview of the dataset and understanding its structure.

**Exercise 6: Summary Statistics**

The df.describe() method generates summary statistics for numerical columns in the DataFrame, such as the count, mean, standard deviation, min, max, and quartiles (25%, 50%, 75%). This is helpful to get a quick statistical summary of the data.

**Exercise 7: Missing Value Check**

To check for missing values in the DataFrame, df.isnull().sum() can be used. This will return the number of missing (null) values in each column.

**Exercise 8: Select Rows Using iloc**

The iloc indexer selects rows by their integer position. To select the first 10 rows, we use df.iloc[:10], which selects the rows at positions 0 through 9.

In [3]:
# NOTE: this cell overwrites df at the end (Exercise 14 drops 'target'), so it
# must be run on a df that still contains 'target' as its last column.
# Re-running it on its own will not behave as intended.

# Exercise 9: Select Columns (numeric features only)
# 'target' is the last column at this point, so slicing off the final
# position leaves only the measurement columns.
numeric_features = df.iloc[:, :-1]
print("\nNumeric feature columns:")
print(numeric_features.head())

# Exercise 10: Filter Data (petal length > 1.5)
# Boolean mask: keep only the rows where the condition holds.
filtered_data = df[df['petal length (cm)'] > 1.5]
print("\nFiltered data (petal length > 1.5):")
print(filtered_data.head())

# Exercise 11: Sort Data (sort by sepal length in descending order)
sorted_data = df.sort_values(by='sepal length (cm)', ascending=False)
print("\nData sorted by sepal length (descending):")
print(sorted_data.head())

# Exercise 12: Create New Column (petal_ratio)
# Element-wise ratio of petal length to petal width.
df['petal_ratio'] = df['petal length (cm)'] / df['petal width (cm)']
print("\nData with new 'petal_ratio' column:")
print(df[['petal length (cm)', 'petal width (cm)', 'petal_ratio']].head())

# Exercise 13: Drop a Column ('petal_ratio')
# drop() returns a new frame; rebinding df removes the helper column again.
df = df.drop(columns=['petal_ratio'])
print("\nData after dropping 'petal_ratio' column:")
print(df.head())

# Exercise 14: Convert Target to DataFrame (Separate target column)
# Double brackets keep the selection a one-column DataFrame. The previous
# df['target'] produced a Series, which contradicted both the exercise title
# and the "DataFrame" labels printed below.
y = df[['target']]
df = df.drop(columns=['target'])  # Remove the 'target' column from the main DataFrame
print("\nDataFrame without 'target' column and separate 'y' DataFrame:")
print(df.head())
print("\nSeparate 'y' DataFrame:")
print(y.head())



Numeric feature columns:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Filtered data (petal length > 1.5):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
5                 5.4               3.9                1.7               0.4   
11                4.8               3.4                1.6               0.2   
18                5.7               3.8                1.7               0.3   
20                5.4               3.4                1.7               0.2   
23                5.1               3.3                1.7               0.5   



**Exercise 9: Select Columns (Using iloc to select numeric feature columns)**

In this exercise, we'll use iloc to select only the numeric feature columns (i.e., the first four columns: 'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)').

We'll do this by specifying the column indices in iloc to select the first four columns.

**Exercise 10: Filter Data (Filter rows where petal length > 1.5)**

We'll use boolean indexing to filter rows based on the condition where the 'petal length (cm)' is greater than 1.5.

**Exercise 11: Sort Data (Sort by sepal length in descending order)**

We'll use sort_values() to sort the dataset based on the 'sepal length (cm)' column in descending order.

**Exercise 12: Create New Column (petal_ratio)**

We'll create a new column called 'petal_ratio', which is the ratio of 'petal length (cm)' to 'petal width (cm)'.

**Exercise 13: Drop a Column**

After creating the new column, we'll drop it using drop().

**Exercise 14: Convert Target to DataFrame**

We'll separate the target column ('target') from the features: it is stored in its own variable (y) and then dropped from the original DataFrame (df), so df is left with the feature columns only.

In [5]:
# Rebuild df from scratch so that the 'target' column exists again for every
# exercise in this cell (Exercise 14 in the previous cell removed it from df).
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Exercise 15: Select X and y for ML
X = df.iloc[:, :-1]  # Features: every column except the trailing 'target'
y = df['target']     # Target: the class codes
print("\nFeatures (X):", X.head(), sep="\n")
print("\nTarget (y):", y.head(), sep="\n")

# Exercise 16: Correlation Matrix
# X already excludes 'target', so correlating X covers the feature columns only.
correlation_matrix = X.corr()
print("\nCorrelation Matrix of feature columns:", correlation_matrix, sep="\n")

# Exercise 17: Unique Values Count
# y is the 'target' column, so its unique values are the distinct classes.
unique_classes = y.unique()
print("\nUnique target classes:", unique_classes, sep="\n")

# Exercise 18: GroupBy (Group data by target class and compute average petal length)
petal_length_by_class = df.groupby('target')['petal length (cm)']
average_petal_length_by_class = petal_length_by_class.mean()
print("\nAverage petal length by target class:", average_petal_length_by_class, sep="\n")

# Exercise 19: Duplicate Check
# duplicated() marks repeated rows; the sum of that mask is their count.
duplicate_mask = df.duplicated()
duplicate_rows = duplicate_mask.sum()
print("\nCount of duplicate rows:", duplicate_rows)

# Remove duplicate rows
df_cleaned = df.drop_duplicates()
print("\nData after removing duplicates:", df_cleaned.head(), sep="\n")

# Exercise 20: Export Clean Data (Save the final DataFrame to CSV)
# index=False keeps the row index out of the written file.
df_cleaned.to_csv('iris_cleaned.csv', index=False)
print("\nCleaned data saved to 'iris_cleaned.csv'")


Features (X):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Target (y):
0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

Correlation Matrix of feature columns:
                   sepal length (cm)  sepal width (cm)  petal length (cm)  \
sepal length (cm)           1.000000         -0.117570           0.871754   
sepal width (cm)           -0.117570          1.000000          -0.428440   
petal length (cm)           0.871754         -0.428440           1.000000   
petal width (cm)            0.817941         -0.366126           0.962865   

                   petal wid

Exercise 15: Select X and y for ML (Using iloc to extract all features and target)

In this exercise, we'll select the feature columns (X) with iloc and the target column (y) by its label. We want to:

Extract the features as X (all columns except target).

Extract the target variable as y (only the target column).

Exercise 16: Correlation Matrix (Print correlation matrix of feature columns)

A correlation matrix helps to understand how the features relate to each other. We'll use .corr() to compute the pairwise correlation of the feature columns in the dataset.

Exercise 17: Unique Values Count (Print unique target classes)

We will use df['target'].unique() to display the unique values in the target column, which represents the classes of the Iris dataset (i.e., the species).

Exercise 18: GroupBy (Group data by target class and compute average petal length)

We'll use groupby() to group the data by the target column (which represents the species). After grouping, we'll calculate the mean of petal length for each species.

Exercise 19: Duplicate Check (Show count of duplicate rows and remove them)

We'll use df.duplicated() to identify duplicate rows, count them, and then remove those duplicates using df.drop_duplicates().

Exercise 20: Export Clean Data (Save the final DataFrame to CSV)

Finally, we'll export the cleaned DataFrame (df_cleaned, which still includes the target column) to a CSV file using the to_csv() method.

In [6]:
# Kaggle trying
# End-to-end sketch of a binary attack classifier:
# load -> inspect -> clean -> encode -> split -> scale -> train -> evaluate
# -> tune -> persist.

# All imports for this cell are gathered here instead of being scattered
# between the steps.
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# TODO: point this at the real dataset. The placeholder below does not exist,
# so read_csv raises FileNotFoundError until it is replaced.
DATA_PATH = "path_to_file.csv"
RANDOM_STATE = 42  # shared seed so the split, the forest and the search are repeatable

# Load dataset
df = pd.read_csv(DATA_PATH)

# Check the first few rows
print(df.head())

# Check data types and missing values
# info() prints its report itself and returns None; wrapping it in print()
# emitted a stray "None" line.
df.info()

# Summary statistics for numeric columns
print(df.describe())

# Check for missing values
print(df.isnull().sum())

# Fill missing values with the column mean (numerical columns only).
# numeric_only=True is required: without it, mean() raises on the string
# columns ('protocol', 'attack_type') in current pandas versions.
df = df.fillna(df.mean(numeric_only=True))

# Convert categorical columns to numeric (if applicable)
# NOTE: cat.codes encodes a missing category as -1.
df['protocol'] = df['protocol'].astype('category').cat.codes

# Encode the label; any value not in the mapping becomes NaN.
df['attack_type'] = df['attack_type'].map({'Benign': 0, 'Malicious': 1})
if df['attack_type'].isna().any():
    # Fail here with a clear message instead of later inside model.fit().
    raise ValueError(
        "attack_type contains missing or unexpected labels; "
        "expected only 'Benign' or 'Malicious'"
    )

# Define features (X) and target (y)
# NOTE(review): any remaining non-numeric feature column will make the
# classifier's fit() fail -- confirm against the real schema.
X = df.drop('attack_type', axis=1)  # features
y = df['attack_type']               # target

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Scale AFTER splitting: the scaler is fitted on the training rows only and
# then applied to the test rows, so no test-set statistics leak into training.
numeric_columns = ['source_port', 'destination_port', 'packet_size']  # example numerical columns
scaler = StandardScaler()
X_train = X_train.copy()  # explicit copies so the assignments below do not touch df
X_test = X_test.copy()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

# Initialize Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Define parameter grid for RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# The estimator is seeded so repeated searches give the same result.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=RANDOM_STATE),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameters found by GridSearchCV
print("Best parameters:", grid_search.best_params_)

# Use the best model
best_model = grid_search.best_estimator_

# Save the trained model to a file
# NOTE: only the classifier is persisted; the fitted scaler is not saved, so
# new data must be scaled the same way before calling predict().
joblib.dump(best_model, 'cybersecurity_model.pkl')

# Load the model for inference later
# joblib files are pickle-based: only load files from a trusted source.
model = joblib.load('cybersecurity_model.pkl')



FileNotFoundError: [Errno 2] No such file or directory: 'path_to_file.csv'