In [None]:
# All required imports for the API request to the database
import requests
import os
import json
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings

# All required imports for word extraction and analysis
import pandas as pd
from pandas import json_normalize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# All required imports for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## 1. Get data from MongoDB Atlas Database

In [None]:
# The MongoDB API key must be stored in variables.env beforehand (it is read below from the API_KEY environment variable)

In [None]:
# Fetch all documents of the maintenance-log collection through the MongoDB Atlas Data API.
# NOTE(review): TLS certificate verification is switched off (verify=False) and the
# matching urllib3 warning is silenced, although the request carries the API key.
# This is kept as in the original setup - confirm whether it is really required.
disable_warnings(InsecureRequestWarning)

api_key = os.getenv('API_KEY')
if not api_key:
    # Fail early with a clear message instead of sending a request without credentials.
    raise RuntimeError("Environment variable 'API_KEY' is not set; cannot query the MongoDB Data API.")

url = "https://eu-central-1.aws.data.mongodb-api.com/app/data-lkdyd/endpoint/data/v1/action/find"
payload = json.dumps({
    "collection": "wartung-log-ausfall",
    "database": "data-project",
    "dataSource": "Cluster0"
})
headers = {
  'Content-Type': 'application/json',
  'Access-Control-Request-Headers': '*',
  'api-key': api_key,
}

# The timeout keeps the cell from hanging forever when the endpoint does not answer.
response = requests.request("POST", url, headers=headers, data=payload, verify=False, timeout=30)

# Surface HTTP errors (e.g. 401 for a wrong key) here rather than as a confusing
# failure in a later cell.
response.raise_for_status()

# Print a short summary instead of dumping the complete response body.
print(f"HTTP {response.status_code}, {len(response.content)} bytes received")

In [None]:
# Decode the raw response body (a JSON string) into Python objects
raw_body = response.text
json_data = json.loads(raw_body)
json_data

In [None]:
# Wrap the decoded JSON response in a DataFrame
df = pd.DataFrame(data=json_data)
df

In [None]:
# Flatten the list of documents into one column per field.
# The frame is built directly from `json_data` rather than by reading and then
# dropping df['documents']: that made the cell fail with a KeyError on a second
# run, because the 'documents' column was already gone.
df_normalized = json_normalize(json_data['documents'])

# Work on a copy so that columns added to `df` later do not modify `df_normalized`
df = df_normalized.copy()

# Show only the first rows instead of printing the complete frame
df.head()

Je nach Use Case hier noch JSON bearbeiten, bevor es in das DF überführt wird

In [None]:
# Lower-case every log message; values that are not strings are passed through unchanged
def _lower_if_str(value):
    if isinstance(value, str):
        return value.lower()
    return value


df['LowText'] = df['LogMessage'].apply(_lower_if_str)


## 2. Data Preparation

### 2.1 Text Preprocessing
- Tokenization
- Lemmatization
- Stemming

In [None]:
# Fetch the NLTK 'snowball_data' package.
# NOTE(review): presumably intended for the SnowballStemmer used in the next cell -
# confirm that the stemmer actually needs this download.
nltk.download('snowball_data')

In [None]:
# Stemming
from nltk.stem.snowball import SnowballStemmer

snowball = SnowballStemmer("english")


def _stem_text(text):
    # Non-string values (e.g. missing messages) become an empty string
    if not isinstance(text, str):
        return ''
    stems = [snowball.stem(token) for token in text.split()]
    return ' '.join(stems)


print("Stemming:")
df['Stem'] = df['LowText'].apply(_stem_text)
print(df['Stem'])

In [None]:
# Download the tokenizer and WordNet resources used by the following cells.
# 'punkt_tab' is what word_tokenize loads in recent NLTK releases (older ones use
# 'punkt'), and some NLTK releases also require 'omw-1.4' before WordNet can be used.
# Requesting all four keeps the notebook runnable across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [None]:
# Tokenization: split every stemmed message into a list of tokens
from nltk.tokenize import word_tokenize

print("Tokenization:")
token_lists = df['Stem'].apply(word_tokenize)
df['Token'] = token_lists
print(df['Token'])

In [None]:
# Lemmatization (English): lemmatize each token and join the result back into one string
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


print("--->Lemmatization:")
df['Lem'] = df['Token'].map(_lemmatize_tokens)
print(df['Lem'])

### 2.2 Data Cleaning

In [None]:
# Count how often each value of the ServiceOK label occurs to judge class balance
service_counts = df['ServiceOK'].value_counts()
print(service_counts)

In [None]:
# Imports for balancing the classes with imbalanced-learn (SMOTE over-sampling).
# `Counter` from this cell is also used by the later resampling cell to print class counts.
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Define the resampling method
# NOTE(review): `resampling` is not referenced by any later cell visible here (the
# training section creates its own `smote` object), and RandomUnderSampler / Pipeline
# are not used in the visible cells either - confirm whether they are still needed.
resampling = SMOTE(sampling_strategy='minority')



## 3. Analyse durchführen

Was müssen wir hier analysieren? Reichen die Textdaten aus, oder müssen die Texte mit den anderen Hardware-Daten verknüpft werden?

In [None]:
# Hold out 20 % of the rows as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

features = df['Stem']
target = df['ServiceOK']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Turn the texts into TF-IDF vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn vocabulary and IDF weights from the training texts only ...
tfidf_vectorizer = TfidfVectorizer().fit(X_train)

# ... then encode both splits with that fitted vocabulary
tfidf_train_vectors = tfidf_vectorizer.transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

In [None]:
# Use SMOTE to balance the classes of the training data (the test data stays untouched)
from collections import Counter
from imblearn.over_sampling import SMOTE

# Define the resampling method; the fixed seed keeps the synthetic samples reproducible
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(tfidf_train_vectors, y_train)

# Report the class counts before and after resampling. The former message labelled
# the resampled counts as the "original" shape.
print('Original dataset shape %s' % Counter(y_train))
print('Resampled dataset shape %s' % Counter(y_train_resampled))

In [None]:
# Visualize the data

def plot_confusion_matrix(y_true, y_pred, title, labels):
    """Draw the confusion matrix of a classifier as an annotated heatmap.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        title: Text shown as the plot title.
        labels: Class labels in the order they should appear on both axes,
            e.g. the ``classes_`` attribute of a fitted estimator.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    class_labels = list(labels)

    # Passing the labels fixes both the size and the row/column order of the
    # matrix, so the tick labels always describe the right cells - also when a
    # class does not occur in y_true / y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

    fig, ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        ax=ax,
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    ax.set_xlabel('Predicted', fontsize=15)
    ax.set_ylabel('True', fontsize=15)
    ax.set_title(title, fontsize=15)
    # Keep the class names on the y axis horizontal
    ax.tick_params(axis='y', labelrotation=0)
    plt.show()



### 3.1 Klassifikation - K-nearest Neighbors

In [None]:
# K-nearest neighbors: fit on the resampled training vectors, predict the test vectors
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_resampled, y_train_resampled)
knn_pred = knn.predict(tfidf_test_vectors)

In [None]:
# Per-class precision / recall / F1 of the KNN predictions on the test set
knn_report = classification_report(y_test, knn_pred)
print("K-nearest neighbors:\n", knn_report)


In [None]:
# Confusion matrix of the KNN predictions on the test set
plot_confusion_matrix(
    y_true=y_test,
    y_pred=knn_pred,
    title="K-nearest neighbors",
    labels=knn.classes_,
)

### 3.2 Klassifikation - Random Forest

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0)

# Train on the SMOTE-balanced training data, exactly like the KNN model, so that
# both classifiers see the same training set and their reports are comparable.
# This cell used to fit on the unbalanced tfidf_train_vectors / y_train.
rf.fit(X_train_resampled, y_train_resampled)
rf_pred = rf.predict(tfidf_test_vectors)

In [None]:
# Per-class precision / recall / F1 of the random forest predictions on the test set
rf_report = classification_report(y_test, rf_pred)
print("Random Forest:\n", rf_report)

In [None]:
# Plot the confusion matrix of the random forest predictions on the test set
plot_confusion_matrix(y_test, rf_pred, "Random Forest", rf.classes_)
