In [169]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit, KFold
import random
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import pandas as pd
import os
warnings.filterwarnings('ignore')

import re
import spacy
PUNCT_TO_REMOVE = string.punctuation
pd.options.mode.chained_assignment = None
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt',quiet = True)
from nltk import word_tokenize
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from cleantext import clean
from sklearn.metrics import classification_report





[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ananya_pramanik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ananya_pramanik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ananya_pramanik/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ananya_pramanik/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [170]:
df = pd.read_csv('data_PFMEA.csv')

In [171]:
def text_cleaning(text):
    #lower case
    
    text = text.lower()

    #Removal of HTML Tags
    html_pattern = re.compile('<.*?>')
    text = html_pattern.sub(r'', text)

    #Removal of URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'', text)

    #Removal of Punctuations
    text = text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

    #Removal of stopwords
    STOPWORDS = set(stopwords.words('english'))
    text = " ".join([word for word in str(text).split() if word not in STOPWORDS])

    #Lemmatization
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    pos_tagged_text = nltk.pos_tag(text.split())
    text = " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

    text = text.replace('\d+', '')

    return text

In [158]:
##################### Train Model ###############

In [172]:
def train_model(df, split, text):
    print('start')
    # text cleaning function
    df["Task_Description"] = df["Task Description"].apply(lambda text: text_cleaning(text))

    df1 = df[df['PFMEA Potential Failure Mode'].isin(df['PFMEA Potential Failure Mode'].value_counts()[df['PFMEA Potential Failure Mode'].value_counts()<2].index)]
    df2 = df[df['PFMEA Potential Failure Mode'].isin(df['PFMEA Potential Failure Mode'].value_counts()[df['PFMEA Potential Failure Mode'].value_counts()>=2].index)]

    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df2, test_size=split, stratify=df2['PFMEA Potential Failure Mode']) 
    train_df = train.append(df1, ignore_index=True)
    test_df = test

    X_train = train_df.Task_Description
    X_test = test_df.Task_Description
    y_train = train_df['PFMEA Potential Failure Mode']
    y_test = test_df['PFMEA Potential Failure Mode']

    # Create dictionary and transform to feature vectors.
    from sklearn.feature_extraction.text import CountVectorizer
    count_vector = CountVectorizer()
    X_train_counts = count_vector.fit_transform(X_train)

    # TF-IDF vectorize.
    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Create model(naive bayes) and training. 
    from sklearn.naive_bayes import MultinomialNB
    clf = MultinomialNB().fit(X_train_tfidf, y_train)

    # Create test documents and vectorize.
    X_new_counts = count_vector.transform(X_test)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)

    # Execute prediction(classification).
    predicted = clf.predict(X_new_tfidf)


    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test,predicted)
    print('Confusion Matrix\n')
    #importing accuracy_score, precision_score, recall_score, f1_score
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, predicted)))
    print('F1 score:', f1_score(y_test, predicted,average='weighted'))
    print('Recall:', recall_score(y_test, predicted,
                                average='weighted'))
    print('Precision:', precision_score(y_test, predicted,
                                        average='weighted'))
    print('\n clasification report:\n', classification_report(y_test, predicted))

    # Create test documents and vectorize.
    input=[text]
    y_new_counts = count_vector.transform(input)
    y_new_tfidf = tfidf_transformer.transform(y_new_counts)
    test = clf.predict(y_new_tfidf)

    s = ''.join(test)


    
    return s


In [173]:
text = "Secure Instrument Panel"
output = train_model(df, 0.25, text)
print("Predicted PFMEA Potential Failure Mode analysis for text ", text, ' is ', output )

start
Confusion Matrix


Accuracy: 0.36

F1 score: 0.25595004915672037
Recall: 0.3616557734204793
Precision: 0.2639548905683947

 clasification report:
                                                precision    recall  f1-score   support

    Electrical component not seated / located       0.00      0.00      0.00         9
                   Fluid line pinched/damaged       0.00      0.00      0.00         2
                      Harness pinched/damaged       0.00      0.00      0.00         3
                        Improper surface prep       0.00      0.00      0.00        10
                    Incorrect amount of fluid       0.00      0.00      0.00         1
               Incorrect orientation/position       0.00      0.00      0.00        34
              Incorrect pressed fit alignment       0.00      0.00      0.00         3
           Insufficient/intermittent adhesive       0.00      0.00      0.00         1
                             Mishandled parts       0.00      0

In [None]:
from operator import indexOf


def sampeling(df, split):
    df1 = df[df['PFMEA Potential Failure Mode'].isin(df['PFMEA Potential Failure Mode'].value_counts()[df['PFMEA Potential Failure Mode'].value_counts()<2].index)]
    df2 = df[df['PFMEA Potential Failure Mode'].isin(df['PFMEA Potential Failure Mode'].value_counts()[df['PFMEA Potential Failure Mode'].value_counts()>=2].index)]

    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df2, test_size=split, stratify=df2['PFMEA Potential Failure Mode']) 
    train_df = train.append(df1, ignore_index=True)
    test_df = test
    train_df.to_csv('train.csv', index = False)
    test_df.to_csv('test.csv', index = False)

    return "Train Test Split Done"



In [None]:
split = sampeling(df,0.25)