## Connection Issue Detection Project

### Import required libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

### Load data and convert it to standard form

In [16]:
# Load the raw measurements; any row containing a missing value is dropped.
Dataframe = pd.read_csv("dataset_corrected_updated.csv").dropna()

# Parse the month/day/year strings straight into datetime values. The original
# formatted them back to 'YYYY-MM-DD' text and re-parsed that text, which
# yields the same datetimes at the cost of two extra passes over the column.
Dataframe['Date'] = pd.to_datetime(Dataframe['Date'], format='%m/%d/%Y')

# Integer-encode the city names; the fitted encoder is kept so the codes can
# be mapped back to names with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
Dataframe['city_encoded'] = label_encoder.fit_transform(Dataframe['City'])

# Numeric form of the date (POSIX seconds as a float) for use as a feature.
Dataframe['timestamp'] = Dataframe['Date'].apply(lambda x: x.timestamp())
# Dataframe

In [17]:
# Disabled alternative: loads "Dataset_Connection_Issues.csv" instead and runs
# the date parsing, city label-encoding and timestamp steps on it.
# Uncomment to switch data sources.
# Dataframe = pd.read_csv("Dataset_Connection_Issues.csv").dropna()
# Dataframe['Date'] = pd.to_datetime(Dataframe['Date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')
# label_encoder = LabelEncoder()
# Dataframe['city_encoded'] = label_encoder.fit_transform(Dataframe['City'])
# Dataframe['Date'] = pd.to_datetime(Dataframe['Date'])
# Dataframe['timestamp'] = Dataframe['Date'].apply(lambda x: x.timestamp())
# Dataframe

### Define the thresholds used to detect issues

In [18]:
# Per-feature limits, keyed by the DataFrame column they apply to.
# detect_issues reports a feature when the row's value is strictly greater
# than the limit given here. Insertion order is the order in which flagged
# features appear in the resulting dict, so keep it stable.
# Entries that are commented out are excluded from detection.
Thresholds = {
    "DNS_Response_Time": 500,
    "Response_Service_Disruptions": 2,
    "Anomaly_Upload_Download_Ratio": 1.5,
    "Consistency_External_Connections": 0.5,
    "Change_Traffic_Type": 10,
    "Latency_Variability": 50,
    "Changes_Network_Topology": 5,
    "Throttling_Traffic_Management": 3,
    "End_Point_Issues": 10,
    "Network_Jitter": 30,
    # "Historical_Data_Analysis": 0.6,
    "Server_Response_Time": 1000,
    "Correlation_Third_Party_Services": 0.7,
    # "Frequent_IP_Address_Changes": 10,
    # "Unplanned_Maintenance": 3,
    "Comparative_Subnet_Performance": 0.5,
    "Connection_Drops_Handovers": 5,
}


### Define the issue-detection function

In [27]:
def detect_issues(row, thresholds=None):
    """Return the features of ``row`` whose value exceeds its threshold.

    Args:
        row: Mapping-like record indexable by feature name, e.g. the Series
            that ``DataFrame.apply(..., axis=1)`` passes for each row.
        thresholds: Optional ``{feature: limit}`` mapping. When omitted, the
            module-level ``Thresholds`` table is used, so existing
            single-argument callers behave exactly as before.

    Returns:
        dict: ``{feature: value}`` for every feature whose value is strictly
        greater than its limit, in the iteration order of ``thresholds``.
        The dict is empty (never ``None``) when nothing is above its limit.

    Raises:
        KeyError: If ``row`` has no entry for a feature named in
            ``thresholds``.
    """
    if thresholds is None:
        # Looked up at call time so later edits to the global table are seen.
        thresholds = Thresholds
    return {
        feature: row[feature]
        for feature, limit in thresholds.items()
        if row[feature] > limit
    }
# Run the detector once per row and store the resulting dict of flagged
# features in a new 'Issue' column.
Dataframe['Issue'] = Dataframe.apply(detect_issues, axis='columns')
# Preview the first 1000 rows with just the identifying columns and the result.
Dataframe.loc[:, ['Date', 'City', 'Time', 'Issue']].head(1000)

Unnamed: 0,Date,City,Time,Issue
0,2023-09-01,Tehran,12:00:00 AM,"{'Response_Service_Disruptions': 7.0, 'Consist..."
1,2023-09-01,Tehran,12:10:00 AM,"{'Response_Service_Disruptions': 23.0, 'Consis..."
2,2023-09-01,Tehran,12:20:00 AM,"{'Response_Service_Disruptions': 50.0, 'Consis..."
3,2023-09-01,Tehran,12:30:00 AM,"{'Response_Service_Disruptions': 19.0, 'Consis..."
4,2023-09-01,Tehran,12:40:00 AM,"{'Response_Service_Disruptions': 69.0, 'Consis..."
...,...,...,...,...
995,2023-09-07,Tehran,9:50:00 PM,"{'Response_Service_Disruptions': 22.0, 'Consis..."
996,2023-09-07,Tehran,10:00:00 PM,"{'Response_Service_Disruptions': 86.0, 'Consis..."
997,2023-09-07,Tehran,10:10:00 PM,"{'Response_Service_Disruptions': 67.0, 'Consis..."
998,2023-09-07,Tehran,10:20:00 PM,"{'Response_Service_Disruptions': 22.0, 'Consis..."


In [20]:
# Recompute the flagged features so this cell can be run on its own.
Dataframe['Issue'] = Dataframe.apply(detect_issues, axis=1)
# detect_issues always returns a dict (empty when nothing is above its
# threshold) and never None, so the previous `is not None` test could never
# produce 'None Issue'. Test for emptiness instead.
# NOTE(review): rows with issues keep a dict as their label; dicts are
# unhashable, so confirm the downstream classifier can accept them as targets
# (an encoded or string label may be needed).
Dataframe['Label'] = Dataframe['Issue'].apply(lambda x: x if x else 'None Issue')
Dataframe[['Date', 'Time', 'Issue', 'Label']].head()

Unnamed: 0,Date,Time,Issue,Label
0,2023-09-01,12:00:00 AM,"{'Response_Service_Disruptions': 7.0, 'Consist...","{'Response_Service_Disruptions': 7.0, 'Consist..."
1,2023-09-01,12:10:00 AM,"{'Response_Service_Disruptions': 23.0, 'Consis...","{'Response_Service_Disruptions': 23.0, 'Consis..."
2,2023-09-01,12:20:00 AM,"{'Response_Service_Disruptions': 50.0, 'Consis...","{'Response_Service_Disruptions': 50.0, 'Consis..."
3,2023-09-01,12:30:00 AM,"{'Response_Service_Disruptions': 19.0, 'Consis...","{'Response_Service_Disruptions': 19.0, 'Consis..."
4,2023-09-01,12:40:00 AM,"{'Response_Service_Disruptions': 69.0, 'Consis...","{'Response_Service_Disruptions': 69.0, 'Consis..."


### Split data for train and test

In [12]:
# Target: the per-row label. Features: every remaining column once the raw
# City/Date/Time columns, the Issue dicts and the target itself are removed
# (the derived 'city_encoded' and 'timestamp' columns stay in).
y = Dataframe['Label']
X = Dataframe.drop(['City', 'Date', 'Time', 'Issue', 'Label'], axis='columns')

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_train.shape, X_test.shape

((34434, 19), (8609, 19))

### Train and predict with model

In [None]:
# Earlier experiment, kept disabled:
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X_train_filled, y_train)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into it.
# X_train / X_test are rebound to the scaler's output from here on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a multinomial logistic regression on the scaled training data.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
# (1.5+) - confirm against the installed version before upgrading.
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)
clf.fit(X_train, y_train)

# Score the held-out split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

### Test the trained model

In [None]:
# Draw a reproducible sample of 500 test rows.
# NOTE(review): `X_test_filled` is not defined in any visible cell of this
# notebook (only X_test exists, and the training cell rebinds it to the
# scaler's output) - confirm where it comes from before running this cell.
sample_data = X_test_filled.sample(n=500, random_state=30)

# Predict an issue label for each sampled row with the trained model.
predicted_issues = clf.predict(sample_data)

# Copy of the sample with the predictions attached as a new column.
results_df = sample_data.assign(Predicted_Issue=predicted_issues)

# Build a report for each predicted issue from the row's 'Time' value, looked
# up in the original frame through the shared index.
# NOTE(review): `report_previous_issue` is not defined in the visible
# notebook - confirm it is provided elsewhere.
results_df['Report'] = results_df.index.map(
    lambda row_id: report_previous_issue(Dataframe.loc[row_id, 'Time'])
)

results_df[['city_encoded', 'Predicted_Issue', 'Report']]