In [2]:
import pandas as pd
import numpy as np
import re
import ast

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
# nltk.download('stopwords',quiet=True)
# nltk.download('wordnet', quiet=True)
# nltk.download('punkt',quiet=True)

from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from scipy.sparse import csr_matrix

In [3]:
def data_pre_pre_processing(df:pd.DataFrame):
    df= (df[["wonum","description","ldtext","mats_assigned","wopriority","actstart"]]
         .drop_duplicates()
         .reset_index(drop=True)
    )
    df[["ldtext", "description"]] = df[["ldtext", "description"]].astype(str)
    df["actstart"] = pd.to_datetime(df["actstart"])

    un_wonums=(df[["wonum"]].drop_duplicates(keep=False))
    df=un_wonums.merge(df,on='wonum').reset_index(drop=True)

    return df

In [4]:
def text_pre_processing(text):

    # Remove numbers and punctuation
    clean_text = "".join([i for i in text if i.isalpha() or i.isspace()])
    # Remove exceess whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text)
    # Transform to lower case
    clean_text = clean_text.lower()

    tokens = nltk.word_tokenize(clean_text)
    #Removestopwords and character-like words
    clean_tokens = [w for w in tokens if (not w in stopwords.words("english")) and (len(w)!=1)]

    # Lemmatizatize the words(not stemming as we will use doc2vec later on which captures the meaning of words, therefore stemming is not applicable in this case)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in clean_tokens]

    return lemm_text

data = pd.DataFrame({"text": ['u can !write up to ""3^*(%) 20GB to th^e current directory (/kaggle/working/) that gets preserved as output when you create a version using Save & Run All ']*int(10e1)})
data['clean_text'] = data['text'].apply(lambda x: text_pre_processing(x))
print(f'Original text:    {data["text"][0]}')
print(f'Transformed text:    {data["clean_text"][0]}')

Original text:    u can !write up to ""3^*(%) 20GB to th^e current directory (/kaggle/working/) that gets preserved as output when you create a version using Save & Run All 
Transformed text:    ['write', 'gb', 'current', 'directory', 'kaggleworking', 'get', 'preserved', 'output', 'create', 'version', 'using', 'save', 'run']


In [5]:
df = pd.read_csv(r"C:\UserData\z004n6cr\OneDrive - Siemens AG\Desktop\Hackathon\data\brake_workorders.csv")

df=data_pre_pre_processing(df)

df["clean_ldtext"] = df['ldtext'].apply(lambda x: text_pre_processing(x))

df["clean_description"] = df['description'].apply(lambda x: text_pre_processing(x))

df.to_csv(path_or_buf=r"C:\Users\z004n6cr\Desktop\TOPicks\Hackathon\data\processed_brake_workorders.csv",index=False)


df.head()

In [6]:
df = pd.read_csv(r"C:\UserData\z004n6cr\OneDrive - Siemens AG\Desktop\TOPicks\Hackathon\data\processed_text.csv")
df 

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description
0,TLW1731982,3 shoe collectors worn below limit - see long ...,410 Side B Bogie 2 410 Side A Bogie 2 412 Side...,,9.0,2020-08-19 06:12:56.000,"['side', 'b', 'bogie', 'side', 'bogie', 'side'...","['shoe', 'collector', 'worn', 'limit', 'see', ..."
1,1184236,Wheel flats / cavities coach 407,,,9.0,,['nan'],"['wheel', 'flat', 'cavity', 'coach']"
2,TLW1981350,Attach ALL Lock pages to the Work Order,,,9.0,2022-02-09 06:24:10.000,['nan'],"['attach', 'lock', 'page', 'work', 'order']"
3,TLW2134700,407 universal toilet is leaking air,,,9.0,2022-11-08 00:35:05.000,['nan'],"['universal', 'toilet', 'leaking', 'air']"
4,TLW1957547,Attachment to be completed and attached by ind...,,,9.0,2021-12-09 13:20:23.000,['nan'],"['attachment', 'completed', 'attached', 'indep..."
...,...,...,...,...,...,...,...,...
174800,TLW1528921,Interior Defects (See long),08/07/2019 700133 401133 F027 Fire\n ...,,3.0,2019-07-19 23:50:11.000,"['f', 'fire', 'extinguisher', 'seal', 'missing...","['interior', 'defect', 'see', 'long']"
174801,TLW1603806,07/12 - Toilet water leak - 402 - LOOU,700123 402 toilet leaking clean water over flo...,A2V00002135547,6.0,2019-12-07 07:19:55.000,"['toilet', 'leaking', 'clean', 'water', 'floor...","['toilet', 'water', 'leak', 'loou']"
174802,1175544,Washer bottles to check and fill if required,,,9.0,2017-10-21 18:11:35.000,['nan'],"['washer', 'bottle', 'check', 'fill', 'required']"
174803,TLW2029921,"Smoke detector functional test, 401 Saloon (PR...",Fire detected in saloon coach 401 resulted in ...,,6.0,,"['fire', 'detected', 'saloon', 'coach', 'resul...","['smoke', 'detector', 'functional', 'test', 's..."


In [7]:
pd.to_datetime(df["actstart"]).describe()

count                           162997
mean     2020-08-19 07:16:03.218881536
min                2006-06-29 22:30:00
25%                2018-09-28 16:00:00
50%                2020-09-11 06:09:54
75%                2022-09-02 18:45:15
max                2024-03-18 08:50:49
Name: actstart, dtype: object

In [8]:
def merge_text_cols(df: pd.DataFrame, col1:str , col2:str):
    
    for index, row in df.iterrows():
        lst1 = ast.literal_eval(row[col1])
        lst2 = ast.literal_eval(row[col2])
        for item in lst2:
            lst1.append(item)
        df.loc[index,f"{col1}_{col2}"] = str(lst1)
        
    return df

In [9]:
def gather_relevant_mats(columns, df):

    def a2v_extraction(text):

        if text != 'nan':
            # Remove exceess whitespace
            clean_text = re.sub(r'\s+', ' ', text)
            pattern = r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]'
            # Replace matched punctuation with a space
            clean_text = re.sub(pattern, ' ', clean_text)
            tokens = nltk.word_tokenize(clean_text)

            a2v_nums = [(w)
                        for w in tokens if w.startswith('A2V') and len(w) > 5]
        else:
            return ''

        return ",".join(list(set(a2v_nums)))

    mats_cols = []
    for col in columns:
        df[f'mats_from_{col}'] = df[col].apply(a2v_extraction)
        mats_cols.append(f'mats_from_{col}')

    df['all_relevant_mats'] = (df[mats_cols].astype(str).apply(','.join, axis=1)
                                            .apply(lambda x: list(set(x.split(','))))

                               )

    df = df.drop(columns=mats_cols)

    return df

def worklog_desc_san(text: str):

    # Remove numbers and punctuation
    clean_text = "".join([i for i in text if i.isalpha() or i.isspace()])
    # Remove exceess whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text)
    # Transform to lower case
    clean_text = clean_text.lower()

    tokens = nltk.word_tokenize(clean_text)

    worklog_stopwords = ['complete', 'progress', 'carried ', 'attend', ]
    # Removestopwords and character-like words
    clean_tokens = [w for w in tokens if (not w in stopwords.words("english")) 
                                          and (len(w) != 1)
                                          and w not in worklog_stopwords ]
    

    # Lemmatizatize the words(not stemming as we will use doc2vec later on which captures the meaning of words, therefore stemming is not applicable in this case)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in clean_tokens]

In [4]:
import pandas as pd
pd.read_csv('C:\UserData\z004n6cr\OneDrive - Siemens AG\Desktop\TOPicks\Projects\Calipri renaming python\Calipri_BrakeDisc_Class350_sample.csv')

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (887382327.py, line 2)

In [10]:
def data_sanitising(df: pd.DataFrame):

    df["actstart"] = pd.to_datetime(df["actstart"])
    df["clean_ldtext_size"] = df["clean_ldtext"].apply(lambda x: len(x))
    df["clean_description_size"] = df["clean_description"].apply(lambda x: len(x))

    # Fix data quality
    df['clean_description'] = df['clean_description'].fillna('[]')
    df['clean_ldtext'] = df['clean_ldtext'].fillna('[]')
    df['ldtext'] = df['ldtext'].astype(str)
    df['description'] = df['description'].astype(str)
    
    df = df.replace({"['nan']": '[]', '[nan]': '[]'})
    df = df.drop(df[df["ldtext"].astype(str).apply(lambda x: x.startswith('[if gte mso 9]&'))].index,axis=0).reset_index(drop=True)

    #Merge clean_description and clean_ldtext
    df = merge_text_cols(df,"clean_description","clean_ldtext")
    cols = ['description', 'ldtext']
    df = gather_relevant_mats(cols, df) 
        
    return df
    

In [11]:
def find_exact_matches(df):
    
    df1 = df.groupby(["clean_description", "clean_ldtext"], dropna = False).count()
    df1 = (df1[df1["wonum"] > 1]["wonum"].reset_index(drop=False))
    df1 = df1.rename(columns={"wonum": "count"}).reset_index(drop=True)
    df1["group_id"] = df1.index 
    df = df.merge(df1,on= ["clean_description", "clean_ldtext"], how='left')
    
    group_map = (df.groupby("group_id", dropna=True)['wonum'].apply(lambda x: ",".join(x))
                 .reset_index()
                 .rename(columns={"wonum": "similar_clean_description_ldtext"})
                )
    df = df.merge(group_map, on=["group_id"], how='left')
    
    df["exact_match"] = np.where(df["group_id"].isna(),0,1)
    
    return df

In [12]:
df = data_sanitising(df)
df = find_exact_matches(df)
df.head(3)

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description,clean_ldtext_size,clean_description_size,clean_description_clean_ldtext,all_relevant_mats,count,group_id,similar_clean_description_ldtext,exact_match
0,TLW1731982,3 shoe collectors worn below limit - see long ...,410 Side B Bogie 2 410 Side A Bogie 2 412 Side...,,9.0,2020-08-19 06:12:56,"['side', 'b', 'bogie', 'side', 'bogie', 'side'...","['shoe', 'collector', 'worn', 'limit', 'see', ...",61,65,"['shoe', 'collector', 'worn', 'limit', 'see', ...",[],,,,0
1,1184236,Wheel flats / cavities coach 407,,,9.0,NaT,[],"['wheel', 'flat', 'cavity', 'coach']",7,36,"['wheel', 'flat', 'cavity', 'coach']",[],,,,0
2,TLW1981350,Attach ALL Lock pages to the Work Order,,,9.0,2022-02-09 06:24:10,[],"['attach', 'lock', 'page', 'work', 'order']",7,43,"['attach', 'lock', 'page', 'work', 'order']",[],1953.0,145.0,"TLW1981350,TLW2002082,TLW2442086,TLW1857151,TL...",1


In [13]:
# from thefuzz import fuzz
# from thefuzz import process

# #df0 =df.copy()
# df['clean_description'] = df['clean_description'].apply(lambda w: " ".join( ast.literal_eval(w)))


# def sim_wonum(df0, text):
#    return ",".join(df0[df0['clean_description'].apply(lambda w: fuzz.ratio(w, text)) > 75]["wonum"])

# df['wonum_fuzz_matches'] = df['clean_description'].apply(lambda w: sim_wonum(df, w))
# df.to_csv(r'clusters_df.csv', index=False)

# a = df.groupby(['wonum_fuzz_matches']).count().sort_values('wonum').reset_index()
# df.to_csv(r'clusters_df.csv', index=False)

# a[ (a['wonum'] != 1) & (a['group_id'] != 0) ]

In [14]:
pd.read_csv(r'clusters_df.csv')

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description,clean_ldtext_size,clean_description_size,clean_description_clean_ldtext,all_relevant_mats,count,group_id,similar_clean_description_ldtext,exact_match,wonum_fuzz_matches
0,TLW1731982,3 shoe collectors worn below limit - see long ...,410 Side B Bogie 2 410 Side A Bogie 2 412 Side...,,9.0,2020-08-19 06:12:56,"['side', 'b', 'bogie', 'side', 'bogie', 'side'...",shoe collector worn limit see long location,61,65,"['shoe', 'collector', 'worn', 'limit', 'see', ...",[''],,,,0,"TLW1731982,TLW1700023,1513965,1510001,TLW17101..."
1,1184236,Wheel flats / cavities coach 407,,,9.0,,[],wheel flat cavity coach,7,36,"['wheel', 'flat', 'cavity', 'coach']",[''],,,,0,1184236102041111959131184235
2,TLW1981350,Attach ALL Lock pages to the Work Order,,,9.0,2022-02-09 06:24:10,[],attach lock page work order,7,43,"['attach', 'lock', 'page', 'work', 'order']",[''],1953.0,145.0,"TLW1981350,TLW2002082,TLW2442086,TLW1857151,TL...",1,"TLW1981350,TLW2002082,TLW2442086,TLW1857151,TL..."
3,TLW2134700,407 universal toilet is leaking air,,,9.0,2022-11-08 00:35:05,[],universal toilet leaking air,7,41,"['universal', 'toilet', 'leaking', 'air']",[''],,,,0,"TLW2134700,TLW1517649,TLW2123511,TLW2190342,TL..."
4,TLW1957547,Attachment to be completed and attached by ind...,,,9.0,2021-12-09 13:20:23,[],attachment completed attached independent checker,7,65,"['attachment', 'completed', 'attached', 'indep...",[''],32.0,147.0,"TLW1957547,TLW1871236,TLW1908566,TLW1908125,TL...",1,"TLW1957547,TLW1871236,TLW1908566,TLW2445473,TL..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174786,TLW1528921,Interior Defects (See long),08/07/2019 700133 401133 F027 Fire\n ...,,3.0,2019-07-19 23:50:11,"['f', 'fire', 'extinguisher', 'seal', 'missing...",interior defect see long,138,37,"['interior', 'defect', 'see', 'long', 'f', 'fi...",[''],,,,0,"TLW1722819,TLW2378468,TLW2012051,TLW2222733,TL..."
174787,TLW1603806,07/12 - Toilet water leak - 402 - LOOU,700123 402 toilet leaking clean water over flo...,A2V00002135547,6.0,2019-12-07 07:19:55,"['toilet', 'leaking', 'clean', 'water', 'floor...",toilet water leak loou,65,35,"['toilet', 'water', 'leak', 'loou', 'toilet', ...",[''],,,,0,"TLW1618383,TLW2255160,TLW1658383,1395351,TLW23..."
174788,1175544,Washer bottles to check and fill if required,,,9.0,2017-10-21 18:11:35,[],washer bottle check fill required,7,49,"['washer', 'bottle', 'check', 'fill', 'required']",[''],71.0,6295.0,"1052580,1190302,1057093,1258563,1162121,126822...",1,"1052580,1190302,1057093,1258563,1062211,107158..."
174789,TLW2029921,"Smoke detector functional test, 401 Saloon (PR...",Fire detected in saloon coach 401 resulted in ...,,6.0,,"['fire', 'detected', 'saloon', 'coach', 'resul...",smoke detector functional test saloon pr,170,59,"['smoke', 'detector', 'functional', 'test', 's...",[''],,,,0,"1036166,TLW1979029,1158190,1304489,TLW2085816,..."


In [15]:
non_exact_ld_matches_df = df[(df["exact_match"] == 0)]
test_df = non_exact_ld_matches_df.copy()

non_exact_ld_matches_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 92979 entries, 0 to 174789
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   wonum                             92979 non-null  object        
 1   description                       92979 non-null  object        
 2   ldtext                            92979 non-null  object        
 3   mats_assigned                     8058 non-null   object        
 4   wopriority                        89768 non-null  float64       
 5   actstart                          87724 non-null  datetime64[ns]
 6   clean_ldtext                      92979 non-null  object        
 7   clean_description                 92979 non-null  object        
 8   clean_ldtext_size                 92979 non-null  int64         
 9   clean_description_size            92979 non-null  int64         
 10  clean_description_clean_ldtext    92979 non-null  

In [16]:
def tf_idf_similarity_df(df:pd.DataFrame, col: str, vect_max_feats : int =500 ,n_splits: int = 10):
      
      if n_splits <= 0 or n_splits > len(df):
        raise ValueError(
            "n_splits should be a positive integer less than or equal to the length of the dataframe")

      q_sim_df = pd.DataFrame()

      corpus = [" ".join( (ast.literal_eval(text))) for text in df[col][0:int(len(df)/n_splits)]]
      vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features = vect_max_feats)
      X = vectorizer.fit_transform(corpus).astype(np.float32)

      print(f'We have {len(corpus)} documents and {len(set(vectorizer.get_feature_names_out()))} unique words in our corpus.\n'
            f'Tf-idf matrix is a {X,X.dtype}') 

      
      sim_mat = cosine_similarity(X)#.astype(np.float32)
      q_sim_df = pd.DataFrame.sparse.from_spmatrix(csr_matrix(np.round(sim_mat.data, 2)))
      
      return q_sim_df


def get_cluster_info(des_mat: pd.DataFrame, df: pd.DataFrame, key_word: str, similarity: float):
    
      words = key_word.split(" ")
      # " ".join(text_pre_processing(key_word))
      matches = df[df["clean_description"].apply(lambda x: all(word in ast.literal_eval(x) for word in words))]

      if len(matches) != 0:
            print(f'Which description best matches what you are looking for?\n')
            print(matches['description'].head(50))
      else:
            print('No matches found in this dataset')
      
      key = input()

      return df.iloc[des_mat[des_mat[int(key)] > similarity].sort_values( int(key), ascending=False).index]


def get_similar_wonums(des_mat: pd.DataFrame, df: pd.DataFrame, index_key: int, similarity: float):

      matches = df.iloc[des_mat[des_mat[int(index_key)] > similarity].sort_values(int(index_key), ascending=False).index]
     
      if (len(matches) != 0) and (len(matches) != 1):
          print(f'Descriptions matched:\n')
          print(matches['description'].head(50))

      else:
          print('No matches found in this dataset')
          return None

      return ",".join(matches['wonum'])

In [17]:
non_exact_ld_matches_df = non_exact_ld_matches_df.reset_index()
n = len(non_exact_ld_matches_df)
non_exact_ld_matches_df1 = non_exact_ld_matches_df[0:int(n/2)].reset_index()
non_exact_ld_matches_df2 = non_exact_ld_matches_df[int(n/2):n].reset_index()

In [19]:
q_sim_df = tf_idf_similarity_df(df, col='clean_description', vect_max_feats=1000, n_splits=4)

We have 43697 documents and 1000 unique words in our corpus.
Tf-idf matrix is a (<43697x1000 sparse matrix of type '<class 'numpy.float32'>'
	with 195133 stored elements in Compressed Sparse Row format>, dtype('float32'))


In [24]:
get_cluster_info(q_sim_df, df, key_word='disc', similarity=0.75)

Which description best matches what you are looking for?

790       406 bogie 2 axle 2 brake disc damaged (see lon...
8942                   406 axle 2 unusual wear brake discs.
19454          411 Axle 3 brake disc chipped. Post Incident
20618       Uneven wear on brake discs - to be investigated
21877     Coach 411 Axle 2 to be replaced - Brake Disc C...
22221       investigate 402 axle 3 discs un-even brake wear
24112            egress glass to be replaced(see long disc)
24881        4 x egress glass broken see long disc for loc)
25509                   egress glass broken (see long disc)
30886                    Replace Coach 407 Hard Disc drives
33528         loss of Pneumatic  brake force(see long disc)
43192       Uneven wear on brake discs - to be investigated
52441     various saloon seat base covers required (see ...
52899     Coach 401 wheel 6 contact disc damage See long...
62197     LOM - 402 wheel 8 disc corroded & pad thicknes...
64414     402 Wheels 2 & 6 brake discs hav

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description,clean_ldtext_size,clean_description_size,clean_description_clean_ldtext,all_relevant_mats,count,group_id,similar_clean_description_ldtext,exact_match
8942,TLW1806598,406 axle 2 unusual wear brake discs.,Rust rings are appearing on the discs (see pic...,,9.0,2021-02-02 04:23:50,"['rust', 'ring', 'appearing', 'disc', 'see', '...","['axle', 'unusual', 'wear', 'brake', 'disc']",106,44,"['axle', 'unusual', 'wear', 'brake', 'disc', '...",[],,,,0
20618,TLW2352598,Uneven wear on brake discs - to be investigated,405 A2,,3.0,NaT,[],"['uneven', 'wear', 'brake', 'disc', 'investiga...",2,51,"['uneven', 'wear', 'brake', 'disc', 'investiga...",[],6.0,6164.0,"TLW2352598,TLW2352604,TLW2352590,TLW2352592,TL...",1
26460,TLW1859072,402 axle 2 uneven brake pad/disc wear LOM,No data available SR 7039 02/10/2023,,3.0,2021-05-22 03:19:00,"['data', 'available', 'sr']","['axle', 'uneven', 'brake', 'paddisc', 'wear',...",27,53,"['axle', 'uneven', 'brake', 'paddisc', 'wear',...",[],,,,0
43192,TLW2352604,Uneven wear on brake discs - to be investigated,406 A1,,3.0,NaT,[],"['uneven', 'wear', 'brake', 'disc', 'investiga...",2,51,"['uneven', 'wear', 'brake', 'disc', 'investiga...",[],6.0,6164.0,"TLW2352598,TLW2352604,TLW2352590,TLW2352592,TL...",1
22221,TLW1756449,investigate 402 axle 3 discs un-even brake wear,,,9.0,2020-10-10 22:00:50,[],"['investigate', 'axle', 'disc', 'uneven', 'bra...",7,58,"['investigate', 'axle', 'disc', 'uneven', 'bra...",[],,,,0
43235,TLW1819018,uneven wear and rust on brake discs-see long,Uneven wear on brake discs on: axles 411 (all)...,,,NaT,"['uneven', 'wear', 'brake', 'disc', 'axle', 'a...","['uneven', 'wear', 'rust', 'brake', 'discssee'...",59,55,"['uneven', 'wear', 'rust', 'brake', 'discssee'...",[],,,,0


In [43]:
get_similar_wonums(q_sim_df, df, index_key=0, similarity=0.75)

Descriptions matched:

0        3 shoe collectors worn below limit - see long ...
41341      2 collector shoes worn - see long for locations
3612         2 collector shoes worn below limit (see long)
9903           Collector shoes worn below limits. See Long
21558         Collector shoes worn below limits (see long)
7                        Collector shoes worn below limits
36351                     shoe collectors worn below limit
28200                    collector shoes worn below limits
27107                    Collector shoes worn below limits
22233                     Collector shoe worn below limits
20353                     Collector shoes worn below limit
11763                    Collector shoes worn below limits
9053     Multiple collector shoes worn below limits - s...
42278                      Shoe Collector worn below limit
31959                  Shoes worn beyond Limits (SEE LONG)
32697                  Shoes worn beyond limits (see long)
41122    Various collector shoes 

'TLW1731982,1513965,1453849,TLW2091693,TLW1937100,1480035,TLW2123576,TLW1894922,1137676,TLW1731328,TLW1858203,TLW1581675,TLW1603096,1129750,TLW1989409,TLW1937234,1424152,1510000,1415745,TLW1771264,TLW2293090'

In [None]:
###
# The below line to be optimised!

# df['tf_idf_wonum_match'] = None
# df['tf_idf_wonum_match'] = df.iloc[0:int(len(df)/3)].apply(lambda row: get_similar_wonums(q_sim_df,df, index_key=row.name, similarity=0.75), axis=1)
# df.to_csv('C:\Users\z004n6cr\Desktop\TOPicks\cluster_data')

###

In [None]:
lst1 = list({'1467459', 'TLW1658161', '1120495', '1442784', 'TLW2450579', 'TLW1700023', 'TLW1517639', '1287647', 'TLW2334770', 'TLW1553602', '1132487', 'TLW1624071', 'TLW1725808', 'TLW1649557', '1487142', 'TLW1711937', 'TLW1799816', 'TLW1599037', '1455503', 'TLW1578069', 'TLW2304034'}
          )
lst2 = list({'1252480', '1336085', 'TLW1614934', '1102660', '1212704', '1140301', 'TLW1735469', 'TLW1567987', 'TLW1998363', 'TLW1603863', 'TLW2134816', 'TLW1620446', 'TLW1645286', '1056788', 'TLW2366693', 'TLW1532594', '1255022'}
            )
df[df['wonum'].isin(lst1)]
#df.loc[0,"description"]

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description,clean_ldtext_size,clean_description_size,clean_description_clean_ldtext,all_relevant_mats,count,group_id,similar_clean_description_ldtext,exact_match
168,TLW1700023,Collector shoes below limit: See long description.,Collector shoes below limit: 401 Bogie 1. B side. 403 Bogie 1. B side. Bogie 2. B side 412 Bogie 1. A side.,,9.0,2020-06-09 03:08:00,"['collector', 'shoe', 'limit', 'bogie', 'b', 'side', 'bogie', 'b', 'side', 'bogie', 'b', 'side', 'bogie', 'side']","['collector', 'shoe', 'limit', 'see', 'long', 'description']",113,60,"['collector', 'shoe', 'limit', 'see', 'long', 'description', 'collector', 'shoe', 'limit', 'bogie', 'b', 'side', 'bogie', 'b', 'side', 'bogie', 'b', 'side', 'bogie', 'side']",[],,,,0
1085,TLW2450579,Multiple shoes worn bellow limit (see long),401 Bogie 2 A & B side 403 Bogie 1 B side 410 Bogie 2 B side,,3.0,2024-02-05 17:30:00,"['bogie', 'b', 'side', 'bogie', 'b', 'side', 'bogie', 'b', 'side']","['multiple', 'shoe', 'worn', 'bellow', 'limit', 'see', 'long']",66,62,"['multiple', 'shoe', 'worn', 'bellow', 'limit', 'see', 'long', 'bogie', 'b', 'side', 'bogie', 'b', 'side', 'bogie', 'b', 'side']",[],,,,0
1793,TLW1553602,8 x collector shoes below limits.,401 side A bogie 1 and 2 403 side A bogie 2 412 side A bogie 1 410 side A bogie 1 and 2 403 side B bogie 1 401 side B bogie 2,,9.0,2019-08-28 04:27:30,"['side', 'bogie', 'side', 'bogie', 'side', 'bogie', 'side', 'bogie', 'side', 'b', 'bogie', 'side', 'b', 'bogie']","['x', 'collector', 'shoe', 'limit']",112,35,"['x', 'collector', 'shoe', 'limit', 'side', 'bogie', 'side', 'bogie', 'side', 'bogie', 'side', 'bogie', 'side', 'b', 'bogie', 'side', 'b', 'bogie']",[],,,,0
2009,TLW1517639,Collector shoes to replace - see long,412 a 401 a 412 b,,9.0,2019-06-16 17:42:14,['b'],"['collector', 'shoe', 'replace', 'see', 'long']",5,47,"['collector', 'shoe', 'replace', 'see', 'long', 'b']",[],,,,0
3566,1455503,2 x Shoe Collector Shoe Worn,412 Bogie 1 side B 403 Bogie 1 Side B,,4.0,2019-03-10 01:10:26,"['bogie', 'side', 'b', 'bogie', 'side', 'b']","['x', 'shoe', 'collector', 'shoe', 'worn']",44,42,"['x', 'shoe', 'collector', 'shoe', 'worn', 'bogie', 'side', 'b', 'bogie', 'side', 'b']",[],,,,0
4347,TLW1658161,Collector shoes to replace (see long),The following shoes are to be replaced 403 Bogie 2 Aside 410 Bogie 1 Bside,,9.0,2020-03-14 05:50:00,"['following', 'shoe', 'replaced', 'bogie', 'aside', 'bogie', 'bside']","['collector', 'shoe', 'replace', 'see', 'long']",69,47,"['collector', 'shoe', 'replace', 'see', 'long', 'following', 'shoe', 'replaced', 'bogie', 'aside', 'bogie', 'bside']",[],,,,0
4679,TLW2304034,10 X shoes collector worn out,,,9.0,2023-07-17 16:20:20,[],"['x', 'shoe', 'collector', 'worn']",7,34,"['x', 'shoe', 'collector', 'worn']",[],,,,0
5183,1287647,7 x collector shoes below limits,,,9.0,2018-05-23 13:17:30,[],"['x', 'collector', 'shoe', 'limit']",7,35,"['x', 'collector', 'shoe', 'limit']",[],3.0,6549.0,"1287647,1298334,TLW1601824",1
5258,TLW1711937,Multiple shoes worn below limits,401 7-8 403 3-4 403 5-6 403 7-8 410 1-2 410 5-6,,3.0,2020-07-07 06:24:56,[],"['multiple', 'shoe', 'worn', 'limit']",2,37,"['multiple', 'shoe', 'worn', 'limit']",[],8.0,3235.0,"TLW1711937,TLW2355929,TLW1591525,TLW1780351,TLW2122753,TLW1922926,TLW1955542,TLW2386452",1
5887,TLW1649557,"COLLECTOR SHOES TO REPLACE, SEE LONG DESCRIPTION",COLLECTOR SHOES REQUIRED ON 401 SIDE B BOGIE 1 403 SIDE A BOGIE 1 412 SIDE B BOGIE 1 412 SIDE A BOGIE 2,,9.0,2020-02-25 08:20:00,"['collector', 'shoe', 'required', 'side', 'b', 'bogie', 'side', 'bogie', 'side', 'b', 'bogie', 'side', 'bogie']","['collector', 'shoe', 'replace', 'see', 'long', 'description']",111,62,"['collector', 'shoe', 'replace', 'see', 'long', 'description', 'collector', 'shoe', 'required', 'side', 'b', 'bogie', 'side', 'bogie', 'side', 'b', 'bogie', 'side', 'bogie']",[],,,,0


In [None]:
set(lst1)

{'1120495',
 '1132487',
 '1287647',
 '1442784',
 '1455503',
 '1467459',
 '1487142',
 'TLW1517639',
 'TLW1553602',
 'TLW1578069',
 'TLW1599037',
 'TLW1624071',
 'TLW1649557',
 'TLW1658161',
 'TLW1700023',
 'TLW1711937',
 'TLW1725808',
 'TLW1799816',
 'TLW2304034',
 'TLW2334770',
 'TLW2450579'}

In [None]:
df.loc[0,'description'],df[df["wonum"].isin(['1467459', 'TLW1658161', '1120495', '1442784', 'TLW2450579', 'TLW1700023', 'TLW1517639', 
                     '1287647', 'TLW2334770', 'TLW1553602', '1132487', 'TLW1624071', 'TLW1725808', 'TLW1649557', 
                     '1487142', 'TLW1711937', 'TLW1799816', 'TLW1599037', '1455503', 'TLW1578069', 'TLW2304034'
                     ])
                     ]

('3 shoe collectors worn below limit - see long for locations',
             wonum                                             description  \
 168    TLW1700023      Collector shoes below limit: See long description.   
 1085   TLW2450579             Multiple shoes worn bellow limit (see long)   
 1793   TLW1553602                       8 x collector shoes below limits.   
 2009   TLW1517639                   Collector shoes to replace - see long   
 3566      1455503                            2 x Shoe Collector Shoe Worn   
 4347   TLW1658161                   Collector shoes to replace (see long)   
 4679   TLW2304034                           10 X shoes collector worn out   
 5183      1287647                        7 x collector shoes below limits   
 5258   TLW1711937                        Multiple shoes worn below limits   
 5887   TLW1649557        COLLECTOR SHOES TO REPLACE, SEE LONG DESCRIPTION   
 5895   TLW1725808                   403 - Side B - 2x Collector shoe worn   


In [None]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

X = tf_idf_similarity_df(df, col='clean_description', vect_max_feats=1000, n_splits=10)

X_sparse = csr_matrix(X)
 
np.where(cosine_sim > 0.8)[0]



# Convert the first row to dense format and reshape
# X0_dense = X_sparse[0].toarray().reshape(1, -1)

# # Compute cosine similarity with all rows in the sparse matrix
# cosine_sim = cosine_similarity(X0_dense, X_sparse).flatten()

# # Find the indices where the cosine similarity is greater than 0.85
# indices = np.where(cosine_sim > 0.85)[0]

# # Extract the vectors in X for each index (in sparse format)
# selected_vectors_sparse = X_sparse[indices]

# # Optionally convert the selected vectors to dense format
# selected_vectors_dense = selected_vectors_sparse.toarray()

# print("Indices:", indices)
# print("Selected Vectors (Dense):")
# print(selected_vectors_dense)

We have 17479 documents and 1000 unique words in our corpus.
Tf-idf matrix is a (<17479x1000 sparse matrix of type '<class 'numpy.float16'>'
	with 78386 stored elements in Compressed Sparse Row format>, dtype('float16'))


array([0], dtype=int64)

In [None]:
print(cosine_similarity(X))

[[1.         0.         0.         ... 0.29947285 0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.14503533 0.        ]
 ...
 [0.29947285 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.14503533 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Sample data for demonstration
X = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
    [1, 1, 0],
    [1, 1, 1],
    [0, 1, 1],
])

# Calculate the cosine similarity between X[0] and all rows in X
cosine_sim = cosine_similarity([X[0]], X)[0]

# Find the indices where the cosine similarity is greater than 0.85
indices = np.where(cosine_sim > 0.85)[0]

# Extract the vectors in X for each index
selected_vectors = X[indices]

print("Indices:", indices)
print("Selected Vectors:", selected_vectors)

Indices: [0]
Selected Vectors: [[1 0 0]]


In [None]:
des_mat.loc[des_mat[int(key)] > 0.5,int(key)].sort_values(ascending = False)

498      1.00
7151     0.74
23603    0.68
4359     0.63
18095    0.61
9202     0.59
2605     0.58
20521    0.57
28339    0.57
32247    0.56
30204    0.55
10197    0.53
Name: 498, dtype: Sparse[float64, 0]

In [44]:
def doc2vec(test_df: pd.DataFrame, text_col: str, vs: int, e: int, win: int, sim_thr: float):

    '''
    Input:  
    text_col: The column of the df which contains the tokenised clean text.
              This column should have the form '['side', 'b', 'bogie',...]' or '[]'
    vs: Vector size parameter of Doc2Vec
    e: Epochs parameter of Doc2Vec
    win: Window parameter of Doc2Vec

    Output:
    model: A Doc2Vec model trained on corpus provided with params provided and min_count=2
    
    '''
    corpus = test_df[text_col]
    tagged_data = [TaggedDocument(words=ast.literal_eval(doc), tags=[str(i)]) for i, doc in enumerate(corpus)]
    print(f'First couple of Tagged data: {tagged_data[0:2]}')
    print(f'Length of tagged data is {len(tagged_data)}')

    # Initialize Doc2Vec model
    model = Doc2Vec(vector_size=vs, min_count=2, epochs=e, window=win)

    # Build vocabulary
    model.build_vocab(tagged_data)

    # Train the model
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    print(f'Model with vec_size={model.vector_size}, window={model.window} and epochs={model.epochs} has been succefully trained based on {text_col}')
    # Get the document embedding vectors
    #document_vectors = [model.infer_vector(ast.literal_eval(doc)) for doc in corpus]


    test_df[f"similar_{text_col}_wonums"] = ""
    test_df[f"similar_{text_col}_index"] = ""

    # Initialize an empty list to store wonum values
    wonum_list = []
    similar_wonums = []
    index_list = []
    similar_index = []


    # Loop through each document in test_df
    for doc_id in range(len(test_df)):
        inferred_vector = model.infer_vector(tagged_data[doc_id].words)
        sims = model.dv.most_similar([inferred_vector], topn= int(len(tagged_data)/100))

        # Use boolean indexing to filter similar wonum values
        similar_wonums = [test_df.iloc[int(sim[0])]["wonum"] for sim in sims if (sim[1] > sim_thr)]  #and (int(sims[0][0]) !=  doc_id)
        similar_index = [ int(sim[0]) for sim in sims if (sim[1] > sim_thr)]  #and (int(sims[0][0]) !=  doc_id)
           
        # Append the list of similar wonum values to wonum_list
        wonum_list.append(similar_wonums if len(similar_wonums) != 1 else '')
        index_list.append(similar_index if len(similar_index) != 1 else '')

    # Assign the array to the new column in test_df
    test_df[f"similar_{text_col}_wonums"] = np.array([",".join(map(str, row)) for row in wonum_list ])
    test_df[f"similar_{text_col}_index"] = np.array([",".join(map(str, row)) for row in index_list ])
    
    return test_df

In [None]:
test_df.head(2)

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description,basic_clean_ldtext,basic_clean_description,clean_ldtext_size,clean_description_size,clean_description_clean_ldtext,count,group_id,similar_clean_description_ldtext,exact_match
0,TLW1731982,3 shoe collectors worn below limit - see long ...,410 Side B Bogie 2 410 Side A Bogie 2 412 Side...,,9.0,2020-08-19 06:12:56,"['side', 'b', 'bogie', 'side', 'bogie', 'side'...","['shoe', 'collector', 'worn', 'limit', 'see', ...","[side, bogie, side, bogie, side, bogie]","[shoe, collector, worn, limit, see, long, loca...",61,65,"['shoe', 'collector', 'worn', 'limit', 'see', ...",,,,0
1,1184236,Wheel flats / cavities coach 407,,,9.0,NaT,[],"['wheel', 'flat', 'cavity', 'coach']",[nan],"[wheel, flat, cavity, coach]",7,36,"['wheel', 'flat', 'cavity', 'coach']",,,,0


In [47]:
# test_df = doc2vec(test_df, text_col='clean_ldtext', vs=125, win=4, e=50, sim_thr=0.7)
test_df = doc2vec(test_df, text_col='clean_description', vs=125, win=4, e=50, sim_thr=0.7)
test_df[["description", "ldtext", "similar_clean_description_wonums", "similar_clean_description_index"]].head(3)

Unnamed: 0,description,ldtext,similar_clean_description_wonums,similar_clean_description_index
0,3 shoe collectors worn below limit - see long ...,410 Side B Bogie 2 410 Side A Bogie 2 412 Side...,"TLW1657356,TLW1731982,TLW1931793,TLW1521505,TL...","14146,0,43387,92165,66820,82146,63431,39507,26..."
1,Wheel flats / cavities coach 407,,"1106899,TLW2313248,TLW2015258,1106776,1452230,...","44747,21262,92891,81589,39707,37852,1,61889,31..."
3,407 universal toilet is leaking air,,"TLW2134700,TLW2402992,1083708,1288660",2895138081342151


In [46]:
test_df

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description,clean_ldtext_size,clean_description_size,clean_description_clean_ldtext,all_relevant_mats,count,group_id,similar_clean_description_ldtext,exact_match,similar_clean_description_wonums,similar_clean_description_index
0,TLW1731982,3 shoe collectors worn below limit - see long ...,410 Side B Bogie 2 410 Side A Bogie 2 412 Side...,,9.0,2020-08-19 06:12:56,"['side', 'b', 'bogie', 'side', 'bogie', 'side'...","['shoe', 'collector', 'worn', 'limit', 'see', ...",61,65,"['shoe', 'collector', 'worn', 'limit', 'see', ...",[],,,,0,"TLW1657356,TLW1731982,TLW1931793,TLW1521505,TL...","14146,0,43387,92165,66820,82146,63431,39507,26..."
1,1184236,Wheel flats / cavities coach 407,,,9.0,NaT,[],"['wheel', 'flat', 'cavity', 'coach']",7,36,"['wheel', 'flat', 'cavity', 'coach']",[],,,,0,"1106899,TLW2313248,TLW2015258,1106776,1452230,...","44747,21262,92891,81589,39707,37852,1,61889,31..."
3,TLW2134700,407 universal toilet is leaking air,,,9.0,2022-11-08 00:35:05,[],"['universal', 'toilet', 'leaking', 'air']",7,41,"['universal', 'toilet', 'leaking', 'air']",[],,,,0,"TLW2134700,TLW2402992,1083708,1288660",2895138081342151
7,1480035,Collector shoes worn below limits,"412, bogie 1, side B 401, bogie 1, side B",,3.0,2019-04-06 16:20:49,"['bogie', 'side', 'b', 'bogie', 'side', 'b']","['collector', 'shoe', 'worn', 'limit']",44,38,"['collector', 'shoe', 'worn', 'limit', 'bogie'...",[],,,,0,"1013990,TLW1567886,1129750,TLW2092353,1358920,...","40684,55304,22595,70584,45407,81352,89283,3064..."
8,1078831,Insructor seat ripped,Small rip in seat swab of instructor seat in c...,A2V00002423169,3.0,2017-02-15 14:00:00,"['small', 'rip', 'seat', 'swab', 'instructor',...","['insructor', 'seat', 'ripped']",61,31,"['insructor', 'seat', 'ripped', 'small', 'rip'...",[],,,,0,"TLW1862628,1093659,1078831,TLW1547167,TLW22841...",171093158845943022013250891687129345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174782,TLW2413275,Automatic coupler heating bands wearing 401+41...,,A2V00002426101,4.0,2023-12-23 06:02:57,[],"['automatic', 'coupler', 'heating', 'band', 'w...",7,65,"['automatic', 'coupler', 'heating', 'band', 'w...",[],,,,0,,
174783,TLW1857689,Sugar Glass Missing on Coach 407 x 2 - Coach ...,,,5.0,2021-05-17 00:26:23,[],"['sugar', 'glass', 'missing', 'coach', 'x', 'c...",7,57,"['sugar', 'glass', 'missing', 'coach', 'x', 'c...",[],,,,0,"TLW1857689,TLW1652687,TLW1969828,1334769,TLW15...","92975,74612,4524,52167,25650,32391,39601,66738..."
174786,TLW1528921,Interior Defects (See long),08/07/2019 700133 401133 F027 Fire\n ...,,3.0,2019-07-19 23:50:11,"['f', 'fire', 'extinguisher', 'seal', 'missing...","['interior', 'defect', 'see', 'long']",138,37,"['interior', 'defect', 'see', 'long', 'f', 'fi...",[],,,,0,"TLW2222293,1367518,TLW1578690,TLW1614583,TLW22...","79822,80396,89933,8989,4850,63798,42271,73822,..."
174787,TLW1603806,07/12 - Toilet water leak - 402 - LOOU,700123 402 toilet leaking clean water over flo...,A2V00002135547,6.0,2019-12-07 07:19:55,"['toilet', 'leaking', 'clean', 'water', 'floor...","['toilet', 'water', 'leak', 'loou']",65,35,"['toilet', 'water', 'leak', 'loou', 'toilet', ...",[],,,,0,"1162464,TLW1767954,TLW1523132,TLW2406982,11661...","39817,92943,53008,61101,77328,83226,39272,5919..."


In [None]:
'''
Test for nearest matches:

for doc_id in range(0, 6):

    inferred_vector = model.infer_vector(tagged_data[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    print('\nDocument ({}): «{}»'.format(
        doc_id, ' '.join(tagged_data[doc_id].words)))
    for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 3), ('FOURTH-MOST', 4), ('FIFTH-MOST', 5), ('LEAST', len(sims) - 1)]:
        # if (sims[index][1] > 0.85):
        print(u'%s %s: «%s»' % (label, sims[index], ' '.join(
            tagged_data[int(sims[index][0])].words)))
'''

"\nTest for nearest matches:\n\nfor doc_id in range(0, 6):\n\n    inferred_vector = model.infer_vector(tagged_data[doc_id].words)\n    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))\n\n    print('\nDocument ({}): «{}»'.format(\n        doc_id, ' '.join(tagged_data[doc_id].words)))\n    for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 3), ('FOURTH-MOST', 4), ('FIFTH-MOST', 5), ('LEAST', len(sims) - 1)]:\n        # if (sims[index][1] > 0.85):\n        print(u'%s %s: «%s»' % (label, sims[index], ' '.join(\n            tagged_data[int(sims[index][0])].words)))\n"

In [50]:
test_df[test_df["wonum"].isin((test_df["similar_clean_description_wonums"][0]).split(','))]

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description,clean_ldtext_size,clean_description_size,clean_description_clean_ldtext,all_relevant_mats,count,group_id,similar_clean_description_ldtext,exact_match,similar_clean_description_wonums,similar_clean_description_index
0,TLW1731982,3 shoe collectors worn below limit - see long ...,410 Side B Bogie 2 410 Side A Bogie 2 412 Side...,,9.0,2020-08-19 06:12:56,"['side', 'b', 'bogie', 'side', 'bogie', 'side'...","['shoe', 'collector', 'worn', 'limit', 'see', ...",61,65,"['shoe', 'collector', 'worn', 'limit', 'see', ...",[],,,,0,"TLW1657356,TLW1731982,TLW1931793,TLW1521505,TL...","14146,0,43387,92165,66820,82146,63431,39507,26..."
734,TLW2070281,3 priority sticker (see long for location),403 * 2 410 * 1,,3.0,2022-07-15 17:00:00,[],"['priority', 'sticker', 'see', 'long', 'locati...",2,50,"['priority', 'sticker', 'see', 'long', 'locati...",[],,,,0,"TLW2070281,TLW1627693,1510001,TLW1916831,15048...","394,33460,39507,26493,63431,48810,43387,66820,..."
2352,TLW1582240,10 soiled std class seat base fabrics - see lo...,All soiled bases have been flipped to aid iden...,,9.0,2019-10-23 05:50:52,"['soiled', 'base', 'flipped', 'aid', 'identifi...","['soiled', 'std', 'class', 'seat', 'base', 'fa...",191,79,"['soiled', 'std', 'class', 'seat', 'base', 'fa...",[],,,,0,"TLW1657356,TLW1567909,TLW1567470,TLW1612395,TL...","14146,67614,66301,82146,1216,0,92165,17107,264..."
14833,TLW1518897,2 snuffer boxes impact damage - see long for l...,Car 403 Side A Bogie 1 Car 401 Side A Bogie ...,,9.0,2019-06-19 12:10:00,"['car', 'side', 'bogie', 'car', 'side', 'bogie']","['snuffer', 'box', 'impact', 'damage', 'see', ...",48,65,"['snuffer', 'box', 'impact', 'damage', 'see', ...",[],,,,0,"TLW1518897,TLW1557676,TLW1541970,TLW2160638,TL...","7880,17107,90149,12971,71338,55468,26493,37428..."
15736,1316887,Longitudinal Buffer Caps loose at several loca...,Longitudinal Buffer Caps found loose on Exam a...,,9.0,2018-07-18 21:59:05,"['longitudinal', 'buffer', 'cap', 'found', 'lo...","['longitudinal', 'buffer', 'cap', 'loose', 'se...",134,65,"['longitudinal', 'buffer', 'cap', 'loose', 'se...",[],,,,0,"1316887,TLW1916831,TLW2276947,TLW2180478,TLW20...","8365,26493,11975,53560,394,33460,64181,55468,1..."
18068,TLW1740314,MPI's Required (SL for locations),411 Axle 3 410 axle 3 407 axle 3,,3.0,2020-10-30 15:29:48,"['axle', 'axle', 'axle']","['mpis', 'required', 'sl', 'location']",24,38,"['mpis', 'required', 'sl', 'location', 'axle',...",[],,,,0,"TLW1740314,TLW2031773,TLW2482314,TLW1792939,10...","9619,11258,34564,39166,23526,17450,15290,32664..."
18355,TLW2099990,Power supply - Shoegear collector shoe - Repl...,401 B2 Side B 403 B1 & B2 Side B 412 B1 & B2...,"A2V00001439493,A2V00001922252,GB8:GSS00002",3.0,2022-09-07 20:14:00,"['b', 'side', 'b', 'b', 'b', 'side', 'b', 'b',...","['power', 'supply', 'shoegear', 'collector', '...",85,90,"['power', 'supply', 'shoegear', 'collector', '...",[],,,,0,"TLW2099990,TLW2156489,TLW1627693,TLW1959107,13...","9767,53383,33460,47079,52907,66820,63431,7880,..."
22522,TLW2276947,SANDER BLOCKAGE H LOCATION,,,9.0,2023-06-07 13:15:00,[],"['sander', 'blockage', 'h', 'location']",7,39,"['sander', 'blockage', 'h', 'location']",[],,,,0,"TLW2276947,TLW1916831,TLW2180478,TLW2160638,TL...","11975,26493,53560,12971,48810,35948,71338,3346..."
24359,TLW2160638,Gearbox oil level low (see long for location),401 axle 2 412 Axle 1 & 2,,9.0,2022-12-19 23:30:00,"['axle', 'axle']","['gearbox', 'oil', 'level', 'low', 'see', 'lon...",16,61,"['gearbox', 'oil', 'level', 'low', 'see', 'lon...",[],,,,0,"TLW1557676,TLW1541970,TLW1518897,TLW2160638,TL...","17107,90149,7880,12971,71338,26493,89305,53383..."
26486,TLW1657356,4 std class seat fabrics soiled - see long for...,car 409 - by door A2 car 409 - mid car car 405...,,9.0,2020-03-12 20:30:00,"['car', 'door', 'car', 'mid', 'car', 'car', 'd...","['std', 'class', 'seat', 'fabric', 'soiled', '...",65,71,"['std', 'class', 'seat', 'fabric', 'soiled', '...",[],,,,0,"TLW1916831,TLW1567909,TLW1657356,TLW2180478,TL...","26493,67614,14146,53560,66301,0,11975,12971,48..."


In [None]:
test_df[["wonum","description", "ldtext", "similar_clean_ldtext_wonums", "similar_clean_ldtext_index", "similar_clean_description_wonums", "similar_clean_description_index"]].head(1)

Unnamed: 0,wonum,description,ldtext,similar_clean_ldtext_wonums,similar_clean_ldtext_index,similar_clean_description_wonums,similar_clean_description_index
0,TLW1731982,3 shoe collectors worn below limit - see long ...,410 Side B Bogie 2 410 Side A Bogie 2 412 Side...,"TLW1778223,TLW1693247,1150654,TLW2216155,14585...","56825,72657,48828,53669,89783,86064,29310,3936...","TLW1731982,TLW1612395,TLW1916831,TLW1541970,TL...","0,82146,26493,90149,73444,12971,49259,63431,66..."


In [None]:
pd.options.display.width = 1000 
a=1
for ite in (test_df["similar_clean_ldtext_index"][0]).split(','):
    if(ite == (test_df[test_df["wonum"]=='TLW1778223'].index[0]) ):
        a=a*0
print

1


In [None]:
test_df["similar_clean_ldtext_index"][0].split(',')

['56825',
 '72657',
 '48828',
 '53669',
 '89783',
 '86064',
 '29310',
 '39368',
 '15068',
 '23722',
 '13850',
 '51660',
 '3055',
 '38172',
 '65589',
 '9767',
 '78310',
 '74587',
 '81409',
 '21155',
 '79152',
 '50031',
 '86157',
 '60115',
 '38401',
 '87737',
 '87509',
 '64548',
 '87845',
 '86945',
 '66662',
 '80473',
 '72109',
 '62408',
 '28072',
 '9578',
 '84573',
 '57993',
 '82377',
 '44331',
 '57843',
 '92068',
 '22523',
 '83821',
 '76671',
 '22905',
 '62110',
 '91990',
 '45817',
 '84864',
 '79389',
 '41548',
 '53058',
 '69110',
 '926',
 '55987',
 '78268',
 '47456',
 '85185',
 '81713',
 '49826',
 '27411',
 '63776',
 '44588',
 '77998',
 '43832',
 '74878',
 '86982',
 '1462',
 '49012',
 '90744',
 '57838',
 '80993',
 '43011',
 '72986',
 '60290',
 '18593',
 '3',
 '66100',
 '38753',
 '40579',
 '53090',
 '28892',
 '48166',
 '87778',
 '85813',
 '81314',
 '55112',
 '81742',
 '33178',
 '86533',
 '69036',
 '6408',
 '22199',
 '72667',
 '84979',
 '68945',
 '72304',
 '25324',
 '65547',
 '10725',
 

In [None]:
test_df[test_df["wonum"] == 'TLW1778223']['similar_clean_ldtext_index']


106758    39368,3055,71771,29310,81742,61196,39017,86064...
Name: similar_clean_ldtext_index, dtype: object

In [None]:
# Create a tuple containing two sets of values extracted from columns of the DataFrame
set12 = (test_df["similar_clean_ldtext_wonums"].apply(lambda x: set(x.split(","))),
         test_df["similar_clean_description_wonums"].apply(lambda x: set(x.split(","))))

 
# Iterate over the sets simultaneously using zip and assign the intersection to the new column
for idx, (item1, item2) in enumerate(zip(*set12)):
    test_df.at[idx, "similar_clean_description_ldtext"] = item1.intersection(item2)

test_df[["similar_clean_description_ldtext",
         "similar_clean_ldtext_wonums", "similar_clean_description_ldtext"]]
# CHECK THE CASES WHERE AN INTESECTION EXISTS! : test_df[test_df["similar_clean_description_ldtext"] != set()]

#Assign back to df the non exact m,atch df we train our doc2vec
df = pd.concat([test_df, (df[(df["exact_match"] != 0)])],axis=0)
df 

ValueError: Must have equal len keys and value when setting with an iterable

In [None]:
def convert_col_to_str(df: pd.DataFrame, col):
    df[col] = df[col].apply(lambda x: ",".join(x) if type(x) == set else x).astype(str)

    return df

In [None]:
df = convert_col_to_str(df, ['similar_clean_description_ldtext', 'similar_clean_description_wonums', 'similar_clean_ldtext_wonums'])
df

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description,basic_clean_ldtext,basic_clean_description,clean_ldtext_size,clean_description_size,clean_description_clean_ldtext,count,group_id,similar_clean_description_ldtext,exact_match,similar_clean_ldtext_wonums,similar_clean_description_wonums
0,TLW1731982,3 shoe collectors worn below limit - see long ...,410 Side B Bogie 2 410 Side A Bogie 2 412 Side...,,9.0,2020-08-19 06:12:56,"['side', 'b', 'bogie', 'side', 'bogie', 'side'...","['shoe', 'collector', 'worn', 'limit', 'see', ...","[side, b, bogie, side, bogie, side, b, bogie]","[shoe, collector, worn, limit, see, long, loca...",61,65,"['shoe', 'collector', 'worn', 'limit', 'see', ...",,,{'TLW1612395'},0,"TLW1778223,1404265,1275277,1147163,1224610,TLW...","TLW1612395,TLW1931793,1490152,TLW2180478,TLW17..."
1,1184236,Wheel flats / cavities coach 407,,,9.0,NaT,[],"['wheel', 'flat', 'cavity', 'coach']",[nan],"[wheel, flat, cavity, coach]",7,36,"['wheel', 'flat', 'cavity', 'coach']",,,{''},0,,
2,TLW2134700,407 universal toilet is leaking air,,,9.0,2022-11-08 00:35:05,[],"['universal', 'toilet', 'leaking', 'air']",[nan],"[universal, toilet, leaking, air]",7,41,"['universal', 'toilet', 'leaking', 'air']",,,set(),0,,"TLW2134700,TLW2402992"
3,1480035,Collector shoes worn below limits,"412, bogie 1, side B 401, bogie 1, side B",,3.0,2019-04-06 16:20:49,"['bogie', 'side', 'b', 'bogie', 'side', 'b']","['collector', 'shoe', 'worn', 'limit']","[bogie, side, b, bogie, side, b]","[collector, shoe, worn, limit]",44,38,"['collector', 'shoe', 'worn', 'limit', 'bogie'...",,,{'TLW2244150'},0,"1160943,TLW1813377,TLW1885089,TLW1805814,14164...","TLW1817064,1086563,TLW1802036,1480035,TLW19013..."
4,1078831,Insructor seat ripped,Small rip in seat swab of instructor seat in c...,A2V00002423169,3.0,2017-02-15 14:00:00,"['small', 'rip', 'seat', 'swab', 'instructor',...","['insructor', 'seat', 'ripped']","[small, rip, seat, swab, instructor, seat, cab]","[insructor, seat, ripped]",61,31,"['insructor', 'seat', 'ripped', 'small', 'rip'...",,,{''},0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174781,TLW1983783,Supervisor Confirms no locks active at both SPCS,,,9.0,2022-02-14 12:29:11,[],"['supervisor', 'confirms', 'lock', 'active', '...",[nan],"[supervisor, confirms, lock, active, spcs]",7,52,"['supervisor', 'confirms', 'lock', 'active', '...",1958.0,5407.0,"TLW2277228,TLW2303835,TLW2470916,TLW1864037,TL...",1,,
174784,TLW2037440,Check for software locks using maintenance laptop,,,9.0,2022-05-21 11:48:45,[],"['check', 'software', 'lock', 'using', 'mainte...",[nan],"[check, software, lock, using, maintenance, la...",7,63,"['check', 'software', 'lock', 'using', 'mainte...",3329.0,1031.0,"TLW2266293,TLW2128026,TLW2353019,TLW2377059,TL...",1,,
174785,TLW1880743,Carry out snuffer box lid bolt torque and tab ...,Carry out inspection of the Snuffer Box Lid Bo...,,3.0,2021-07-22 03:30:04,"['carry', 'inspection', 'snuffer', 'box', 'lid...","['carry', 'snuffer', 'box', 'lid', 'bolt', 'to...","[carry, inspection, snuffer, box, lid, bolt, c...","[carry, snuffer, box, lid, bolt, torque, tab, ...",730,130,"['carry', 'snuffer', 'box', 'lid', 'bolt', 'to...",82.0,862.0,"TLW1880780,TLW1880975,TLW1880764,TLW1880827,TL...",1,,
174788,1175544,Washer bottles to check and fill if required,,,9.0,2017-10-21 18:11:35,[],"['washer', 'bottle', 'check', 'fill', 'required']",[nan],"[washer, bottle, check, fill, required]",7,49,"['washer', 'bottle', 'check', 'fill', 'required']",71.0,6295.0,"1052580,1190302,1057093,1258563,1162121,126822...",1,,


In [None]:
test_df['mats_assigned'].apply(lambda x : (str(x).split(',')))

corpus =  test_df['mats_assigned'].fillna('')
vectorizer = CountVectorizer(stop_words=None,
                            lowercase=False,
                            token_pattern=r"(?u)[^,]+")

X = vectorizer.fit_transform(corpus)
X,len( (vectorizer.get_feature_names_out())) 
np.array(X[4])

array(<1x1674 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>, dtype=object)

In [None]:
test_df[~test_df['mats_assigned'].isna()].head(2)

NameError: name 'test_df' is not defined

In [None]:
set1 = set((vectorizer.get_feature_names_out()))

set2 = set([item for sublist in [str(x).split(',')
           for x in test_df['mats_assigned']] for item in sublist if item != 'nan'])
len(list(set1))
set1.difference(set2)

assert set1.difference(set2)==set()

In [None]:
test_df[["similar_clean_ldtext_wonums","similar_clean_description_wonums","intersection_column"]]
len(test_df[test_df["intersection_column"] != set()])/len(test_df)
test_df[(test_df["intersection_column"] != set()) & (test_df["intersection_column"].apply(
    lambda x: len(x)) >1)][["similar_clean_ldtext_wonums", "similar_clean_description_wonums", "intersection_column"]]

Unnamed: 0,similar_clean_ldtext_wonums,similar_clean_description_wonums,intersection_column
9,"TLW1714634,TLW1870670,TLW1833607,1051812,1327827,TLW1867440,TLW1871217,TLW1821430,TLW1723053,TLW1789052,TLW1854232,1052395,1456233,1049725,1207200,1398689,TLW1564062,TLW1540735,1014369,1504493,120...","1321740,1076960,1330725,TLW2205528,TLW1649520,TLW2057978,1125488,TLW1540709,TLW1813987,TLW1996045,1452309,TLW1917729,1274781,TLW2416776,TLW2082180,TLW1765699,1396087,TLW1677764,TLW1969103,1250126,...","{1501732, TLW1848283, TLW1557412, 1498842, TLW1833612, 1314366, 1162992, TLW1813880, 1160186, TLW1823475}"
40,"TLW1870670,TLW1714634,TLW1833607,TLW1871217,1504493,TLW1867440,TLW1821430,1051812,1456233,1327827,1049725,TLW1854232,TLW1537256,TLW1728015,1138251,1052395,TLW1625179,TLW1773902,1014369,1308613,120...","1274781,TLW1825555,TLW1845511,1321740,1076960,1250126,TLW1817321,1125488,TLW1818014,1111564,TLW1724503,1054413,TLW2210115,TLW1837055,TLW1821457,TLW1886215,TLW1829196,1139794,TLW1848526,TLW1841784,...","{TLW1882355, TLW1844332, TLW1813880, TLW1853638, TLW1870670, TLW1848283, 1207864, TLW1557412, 1498842, TLW1869676, 1314366, 1160186, 1501732, 1323354, TLW1818004, 1162992, TLW1761604, TLW1853637, ..."
77,"TLW1870670,TLW1714634,TLW1833607,TLW1867440,TLW1871217,TLW1854232,1207200,TLW1728015,1049725,1327827,1056414,TLW1564062,1308613,1051812,1052395,1504493,1456233,TLW1723053,TLW1789052,TLW1751683,TLW...","1452309,TLW2110718,1239241,1004982,TLW1813987,1250126,1125488,1143704,1306217,1321740,TLW2082180,TLW1823445,TLW1649520,1073453,1054413,TLW1813677,1069062,1322591,TLW1724503,1075775,1076960,1021894...","{TLW1739020, 1162992}"
154,"TLW1768484,TLW1768485","TLW1826091,TLW2305365,1076960,TLW1677764,1396087,TLW1583981,TLW1768484,TLW2265602,TLW1625944,1125488,1057089,TLW2416866,TLW1761866,TLW1722972,TLW1758107,1378491,TLW1647432,TLW2268252,TLW1825555,TL...","{TLW1768484, TLW1768485}"
158,"TLW1870670,TLW1714634,TLW1564062,TLW1833607,TLW1867440,TLW1854232,TLW1871217,TLW1821430,1051812,TLW1728015,TLW1723053,1504493,1049725,1327827,TLW1732162,1010815,1207200,1138251,TLW1862958,TLW17516...","TLW1742416,TLW2297067,TLW1770507,TLW1941182,TLW2071532,TLW2309512,TLW1625241,TLW1899787,TLW2348234,TLW2040616,TLW1660276,TLW1791745,TLW1970047,TLW2235311,TLW1764276,TLW1658013,TLW2312317,TLW231444...","{TLW1528931, TLW1657444}"
...,...,...,...
92555,"TLW1962772,TLW1870670,TLW1714634,TLW1867440,TLW1789052,TLW1854232,TLW1871217,TLW1821430,1051812,TLW1994974,TLW1564062,1308613,TLW2289545,1456233,1138251,1207200,TLW1773902,1010815,1327827,1049725,...","TLW2363109,1483318,1511595,TLW1930978,TLW1588021,1343659,1367518,TLW2227036,TLW1689818,TLW1688852,1107749,TLW1520047,1411667,TLW2189950,TLW2314916,TLW1614583,TLW1703055,TLW1524824,TLW1589840,TLW21...","{TLW2266266, 1447825}"
92567,11205011120500,11205011120500,"{1120501, 1120500}"
92657,"TLW1870670,TLW1714634,TLW1871217,TLW1867440,1051812,TLW1833607,1456233,TLW1854232,1049725,1207456,1327827,1308613,TLW1564062,1052395,TLW1728015,TLW1537256,TLW1773902,TLW1789052,1207200,1504493,101...","1483377,TLW1956211,1511595,TLW2151453,TLW2227036,TLW1730324,TLW1566979,1264932,TLW1526643,TLW2476644,TLW1787167,TLW1518654,TLW2221804,TLW2324530,TLW2160640,TLW1524827,TLW1870396,TLW2481871,TLW2115...","{TLW1805204, 1497646}"
92814,"TLW1628326,TLW2119871,TLW1718837,TLW1721890,TLW2055790,TLW2192566,TLW1989041,TLW1909860,TLW2106650,TLW2006515,TLW1722816,TLW1518047,TLW1632325,1490352,TLW2262056,TLW2024705,TLW1852190,TLW2394299,T...","TLW1852190,TLW1937190,TLW1938320,TLW1901757,TLW1929441,TLW2031171,TLW2314367,TLW2272435,TLW1618099,TLW1849389,TLW1894122,TLW1956541,TLW1936482,TLW2195534,TLW1976723,TLW1570732,TLW1983405,TLW176107...","{TLW1852190, TLW2262056}"


In [None]:
test_df.iloc[198]

NameError: name 'test_df' is not defined

In [None]:
test_df["intersection_column"].apply(lambda x: len(x)).describe()

count    92979.000000
mean         0.575366
std          0.955835
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max         28.000000
Name: intersection_column, dtype: float64

In [None]:
import random

a=test_df[test_df["similar_ld_wonums"] != ''].index
#rand_indx = (random.randint(0,len(a)))
won_lst = test_df.loc[a[rand_indx], "similar_ld_wonums"].split(',')
test_df[test_df["wonum"].isin(won_lst)]#.groupby("similar_ld_wonums").count()

KeyError: 'similar_ld_wonums'

In [None]:
test_df.describe()

Unnamed: 0,wopriority,actstart,clean_ldtext_size,clean_description_size,count,group_id,exact_match
count,89768.0,87724,92979.0,92979.0,0.0,0.0,92979.0
mean,5.871925,2020-05-02 17:00:16.605889024,192.563902,46.578496,,,0.0
min,0.0,2015-06-23 15:46:24,2.0,2.0,,,0.0
25%,4.0,2018-07-30 16:21:47.750000128,7.0,34.0,,,0.0
50%,6.0,2020-02-18 12:35:20.500000,79.0,45.0,,,0.0
75%,9.0,2022-03-16 14:44:10,181.0,57.0,,,0.0
max,98.0,2024-03-18 08:50:49,47690.0,137.0,,,0.0
std,2.560169,,532.392297,17.274453,,,0.0


In [None]:
test_df[test_df['wonum'].isin(ast.literal_eval(test_df.iloc[0]["similar_ldtext_wonums"]))]['clean_ldtext']

Unnamed: 0,wonum,description,ldtext,mats_assigned,wopriority,actstart,clean_ldtext,clean_description,basic_clean_ldtext,basic_clean_description,clean_ldtext_size,clean_description_size,count,group_id,exact_match,similar_ldtext_wonums
9339,TLW2334770,"shoes worn below limit, replacement required",401 B Side Bogie 2 403 B Side Bogie 1 403 B Si...,,9.0,2023-08-30 11:30:00,"['b', 'side', 'bogie', 'b', 'side', 'bogie', '...","['shoe', 'worn', 'limit', 'replacement', 'requ...","[b, side, bogie, b, side, bogie, b, side, bogie]","[shoe, worn, limit, replacement, required]",66.0,52.0,,,0.0,
17998,TLW1984341,412 side B bogie 1 & 410 side A arc shields da...,412 side B bogie 1 410 side A bogie 1,,9.0,2022-02-15 15:55:00,"['side', 'b', 'bogie', 'side', 'bogie']","['side', 'b', 'bogie', 'side', 'arc', 'shield'...","[side, b, bogie, side, bogie]","[side, b, bogie, side, arc, shield, damaged, s...",39.0,92.0,,,0.0,
43000,TLW1648264,Shoes need replacing (see long),403 Bogie 2 side A 410 Bogie 1 side B,,9.0,2020-02-22 11:03:53,"['bogie', 'side', 'bogie', 'side', 'b']","['shoe', 'need', 'replacing', 'see', 'long']","[bogie, side, bogie, side, b]","[shoe, need, replacing, see, long]",39.0,44.0,,,0.0,
53505,TLW2054792,Shoe Replace,401 Bogie 2 B side 403 Bogie 2 A side,,9.0,2022-06-17 21:00:00,"['bogie', 'b', 'side', 'bogie', 'side']","['shoe', 'replace']","[bogie, b, side, bogie, side]","[shoe, replace]",39.0,19.0,,,0.0,
54267,1340356,Two shoes found worn beyond limits,410 Side B bogie 1 401 bogie 1 side B,,3.0,2018-09-02 09:33:15,"['side', 'b', 'bogie', 'bogie', 'side', 'b']","['two', 'shoe', 'found', 'worn', 'beyond', 'li...","[side, b, bogie, bogie, side, b]","[two, shoe, found, worn, beyond, limit]",44.0,51.0,,,0.0,
55097,TLW1885089,Shoes require replacing see long,401 bogie 1 B side 401 bogie 2 B side,,9.0,2021-07-08 04:00:52,"['bogie', 'b', 'side', 'bogie', 'b', 'side']","['shoe', 'require', 'replacing', 'see', 'long']","[bogie, b, side, bogie, b, side]","[shoe, require, replacing, see, long]",44.0,47.0,,,0.0,
106758,TLW1778223,Shoes to be replaced,401 bogie 1 b side 401 bogie 2 b side 412 bogi...,,9.0,2020-12-01 06:28:36,"['bogie', 'b', 'side', 'bogie', 'b', 'side', '...","['shoe', 'replaced']","[bogie, b, side, bogie, b, side, bogie, b, side]","[shoe, replaced]",66.0,20.0,,,0.0,
124692,TLW1622468,7 x secondary vertical dampers excessive leaking,406 bogie 1 & 2 B side 407 bogie 1 B side 410 ...,,9.0,2020-01-16 08:34:00,"['bogie', 'b', 'side', 'bogie', 'b', 'side', '...","['x', 'secondary', 'vertical', 'damper', 'exce...","[bogie, b, side, bogie, b, side, bogie, side, ...","[x, secondary, vertical, damper, excessive, le...",95.0,64.0,,,0.0,
134813,TLW2000303,shoe collector worn out see long description,412 bogie 2 b side 410 bogie 1 a and b side 40...,,9.0,2022-03-18 18:25:34,"['bogie', 'b', 'side', 'bogie', 'b', 'side', '...","['shoe', 'collector', 'worn', 'see', 'long', '...","[bogie, b, side, bogie, b, side, bogie, b, side]","[shoe, collector, worn, see, long, description]",66.0,59.0,,,0.0,
169545,TLW2386736,Bogie earth straps broken - See Long,404 Bogie 2 Side B 405 Bogie 2 Side A,,9.0,2023-11-12 09:32:09,"['bogie', 'side', 'b', 'bogie', 'side']","['bogie', 'earth', 'strap', 'broken', 'see', '...","[bogie, side, b, bogie, side]","[bogie, earth, strap, broken, see, long]",39.0,52.0,,,0.0,


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

n=10000
sim_matrix=cosine_similarity(document_vectors[0:n],document_vectors[0:n]) 
sim_matrix.shape

workorders_matrix=np.zeros(shape=(n,n))

for i in range (0,n):
    similar_i_workorders=[]
    for j in range(0,n):
        similarity=a[i][j]
        if (similarity>0.85) and (i!=j):
            similar_i_workorders.append(df.iloc[j]["wonum"])
    df.loc[i,"similar_text_description_wonum"] = str(similar_i_workorders)
 
df

In [None]:
df[~df["similar_text_description_wonum"].isna()].sort_values("actstart", ascending=0).iloc[190]["similar_text_description_wonum"]

In [None]:
df.to_csv(r"C:\Users\z004n6cr\Desktop\TOPicks\Hackathon\initial_results.csv",index=False)

In [None]:
df[df["wonum"].isin(['1457649', 'TLW2081497', 'TLW1928621', 'TLW2104386', 'TLW1515980', 'TLW1941147', 'TLW1765068', 'TLW1841043', 'TLW1570120', 'TLW2115522', 'TLW1925629', '1504482', 'TLW2392722', 'TLW2146670', 'TLW1675387', 'TLW1878817', 'TLW1810186', 'TLW1648849', 'TLW2440300', 'TLW2084654', 'TLW2036188', 'TLW2459601', 'TLW1551039', '1502623', 'TLW1995869', '1510140', 'TLW2129265', 'TLW1798762', 'TLW1914005', '1384816', 'TLW1546648', 'TLW1714946', 'TLW1607398', 'TLW1733336', 'TLW1872768', 'TLW1833112', 'TLW2189963', '1510333', 'TLW1894243', 'TLW2441926', 'TLW2463882', 'TLW1866558', '1384803', 'TLW1863087', 'TLW2390708', 'TLW1979080', 'TLW2376574', 'TLW1772177', 'TLW2273016', 'TLW2424505', 'TLW1855301', 'TLW2124742', 'TLW2435707', 'TLW2104497', 'TLW2285449', 'TLW2268182', 'TLW1536950', 'TLW2432527', 'TLW1839486', 'TLW1703252', 'TLW2343436', 'TLW2321146', '1391028', 'TLW2037210', 'TLW2299149', 'TLW1722063', 'TLW1742412', '1384826', 'TLW1780184', 'TLW2364377', 'TLW1722195', 'TLW1763576', 'TLW2130362', 'TLW2334309', 'TLW2116411', '1417042', 'TLW1541175', 'TLW2328395', '1460988', 'TLW1909892', 'TLW1553599', 'TLW2045694', 'TLW2372519', 'TLW1814278', 'TLW1556606', 'TLW1621813', 'TLW1757708', 'TLW1944640', 'TLW2232925', 'TLW1635007', 'TLW1761857', 'TLW1985150', 'TLW1966913', 'TLW1871331', 'TLW1597836', 'TLW1963120', 'TLW2262155', '1444857', 'TLW1530768', 'TLW2365513', 'TLW1674342', 'TLW1821452', 'TLW2406901', 'TLW1569013', 'TLW2289550', 'TLW1543098', '1405753', 'TLW1629109', 'TLW1982096', 'TLW1601213', 'TLW1951509', 'TLW2413705', 'TLW2222707', 'TLW2267923', 'TLW1618550', '1507382', 'TLW1880371', 'TLW1964408', 'TLW1524408', 'TLW1703722', 'TLW1554012', '1509813', 'TLW1778973', 'TLW1537936', 'TLW1730659', 'TLW1848525', 'TLW2165051', '1460685', 'TLW1898626', 'TLW2343830', 'TLW1877462', 'TLW1890389', 'TLW2407122', '1433223', 'TLW1841648', '1482974', 'TLW2085839', '1498060', 'TLW1542131', 'TLW2250986', '1490857', 'TLW1692305', '1395802', 'TLW2263662', 'TLW1583549', 'TLW2127672', 'TLW1846853', '1467451', 'TLW2345967', 'TLW1900760', 'TLW1700726', 'TLW1872409', 'TLW2107815', '1489231', 'TLW1994097', 'TLW1542186', 'TLW1601220', 'TLW2295896', 'TLW2472153', '1420850', 'TLW2363179', 'TLW2133731', 'TLW2303538', 'TLW1820221', 'TLW2314855', 'TLW2466780', 'TLW1634696', 'TLW1753210',
                     'TLW2046649', 'TLW1822971', 'TLW1742671', 'TLW2259254', 'TLW2039946', 'TLW1711805', 'TLW2003813', 'TLW1852914', 'TLW1546435', 'TLW1744967', 'TLW1959668', 'TLW1781682', 'TLW2390586', 'TLW2043372', '1408428', 'TLW1600456', 'TLW1935302', 'TLW1765682', 'TLW1714685', 'TLW2156609', 'TLW1691203', 'TLW1743707', 'TLW1851251', 'TLW2128089', 'TLW1598903', 'TLW2129249', 'TLW2293122', 'TLW2241466', 'TLW1952280', 'TLW1856233', 'TLW1812024', '1420647', 'TLW2465416', 'TLW1700430', 'TLW1684254', 'TLW2006276', 'TLW1585795', '1468133', 'TLW2079808', 'TLW2228857', 'TLW1756140', 'TLW1943115', '1480041', 'TLW1515964', 'TLW2484063', 'TLW2033017', 'TLW1887465', 'TLW2056175', 'TLW1855019', 'TLW2408236', 'TLW1923458', 'TLW1840186', 'TLW1589380', 'TLW2059894', 'TLW1572015', '1409192', 'TLW1573068', '1459195', 'TLW1710846', 'TLW2127390', '1452252', 'TLW1608653', '1495200', 'TLW2114674', 'TLW2235915', 'TLW1853636', 'TLW1658893', 'TLW1882988', 'TLW2281614', 'TLW1823195', 'TLW2037245', 'TLW2458002', 'TLW2203286', 'TLW2082191', 'TLW1776316', 'TLW1914195', 'TLW1784754', 'TLW1622332', 'TLW1919987', 'TLW1578215', 'TLW2352844', 'TLW2266115', '1487949', 'TLW1515801', 'TLW2360510', 'TLW2195015', 'TLW1762394', '1399803', 'TLW1974138', '1514846', 'TLW1762582', 'TLW1692391', '1509085', '1458815', 'TLW1677161', 'TLW1997916', 'TLW1617431', '1406826', 'TLW2178638', 'TLW1897337', 'TLW2071414', 'TLW2187153', 'TLW1761620', '1480450', 'TLW2102173', 'TLW1727446', 'TLW2145643', '1454646', 'TLW1688306', 'TLW1918644', 'TLW2227356', 'TLW2352264', 'TLW1917176', 'TLW1619582', 'TLW1809555', 'TLW2047846', 'TLW1704227', 'TLW1524590', 'TLW2331774', 'TLW1830833', 'TLW1991893', 'TLW1783039', 'TLW1684943', 'TLW1911104', 'TLW1910878', 'TLW1912006', 'TLW1520250', 'TLW1898551', 'TLW1994920', 'TLW1694229', 'TLW2295002', 'TLW1679594', 'TLW1965350', 'TLW1694796', 'TLW1607248', 'TLW2474180', 'TLW2059690', 'TLW2146289', 'TLW2045379', 'TLW1773913', 'TLW1692017', '1443116', 'TLW1790713', 'TLW1808775', 'TLW1558048', 'TLW1880915', 'TLW1633162', 'TLW2216457', 'TLW2007812', 'TLW2221118', 'TLW1546603', 'TLW1807652', 'TLW1533683', 'TLW2468517', 'TLW1729540', 'TLW1694156', 'TLW1722191', 'TLW1779953', 'TLW2327458', '1457093', 'TLW1788724', 'TLW1924639', 'TLW2183221', 'TLW2186711', 'TLW1997895', 'TLW2425889', 'TLW1995067', 'TLW2191123'])]["description"].value_counts()

In [None]:
df.sort_values("clean_ldtext_size", ascending=0).loc[49812, "clean_ldtext"]

In [None]:
def comb_lists(l1: list[str],l2: list[str]):
    out=l1.extend(l2)

    return out

In [None]:
df=( df[(~df["description"].isna())  &  (~df["ldtext"].isna())] )
df

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Sample data (replace with your own)
documents = df["clean_ldtext"] 

# documents = ["This is the first document.",
#  "This document is the second document.",
#  "And this is the third one.",
#  "Is this the first document?"]

# TaggedDocument(words="['side', 'b', 'bogie', 'side', 'bogie', 'side', 'b', 'bogie']", tags=['0']),
# TaggedDocument

# Tokenize the documents
tokenized_documents = [doc for doc in documents]

# Create tagged document corpus
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(tokenized_documents)]
tagged_data[0:5]

In [None]:
###TRY TO FIND WORD EMBEDDINGS FOR COMBINED LD CLEAN + DSC CLEAN ####
#[ [x,y] for x,y in (zip(df2["clean_description"],df2["clean_ldtext"]))]

In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["description_cleaned"])
vectorizer.get_feature_names_out()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

clusters_df=pd.read_csv(r"C:\Users\z004n6cr\Desktop\TOPicks\Hackathon\data\cluster_data.csv")
# Assuming df is your DataFrame with a column named "description_clean"
texts = df["description_cleaned"].tolist()

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# K-means Clustering
k = len(clusters_df)  # Number of clusters
kmeans = KMeans(n_clusters=k)
kmeans.fit(tfidf_matrix)

# Assign cluster labels to the original text data
clusters = kmeans.labels_

# Combine text data with cluster labels
data_with_clusters = pd.DataFrame({'Text': texts, 'Cluster': clusters})

# Print the results
print(data_with_clusters)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Sample DataFrame
data = {'concatenated_strings': ['apple,banana,apple,orange', 'banana,banana,orange', 'apple,apple,orange']}
df = pd.DataFrame(data)

vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','))

X = vectorizer.fit_transform(df['concatenated_strings'])
# Scaling
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

# K-Means
kmeans_model = KMeans(n_clusters=2, random_state=42)
kmeans_clusters = kmeans_model.fit_predict(X_scaled)

# DBSCAN
dbscan_model = DBSCAN(eps=1, min_samples=1)
dbscan_clusters = dbscan_model.fit_predict(X_scaled)
df['kmeans_cluster'] = kmeans_clusters
df['dbscan_cluster'] = dbscan_clusters

# Evaluate clustering using silhouette score
silhouette_kmeans = silhouette_score(X_scaled, kmeans_clusters)
silhouette_dbscan = silhouette_score(X_scaled, dbscan_clusters)

print("K-Means Silhouette Score:", silhouette_kmeans)
print("DBSCAN Silhouette Score:", silhouette_dbscan)
print(df)


In [None]:
fdist = FreqDist( word_tokenize(df))

In [None]:
# #Stemming
# stemmer = PorterStemmer()
# stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
# print(stemmed_tokens)

# #Lemmatization
# lemmatizer = WordNetLemmatizer()
# lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
# print(lemmatized_tokens)

In [None]:
# Convert text to lowercase
normalized_text = text.lower()
print(normalized_text)

# Remove numbers
normalized_text = ''.join([word for word in normalized_text if not word.isdigit()])
print(normalized_text)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True)
X = vectorizer.fit_transform(test['text_cleaned']).toarray()
X


In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

# Our sentences to encode
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog."
]

# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

In [None]:
import time
import numpy as np
st = time.time()

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
test['encode_transforemers'] = test['text_cleaned'].apply(lambda text: model.encode(text, convert_to_numpy=True).flatten())

et = time.time()

print("Elapsed time: {:.2f} seconds".format(et - st))

X_transformers = np.vstack(test['encode_transforemers'])
#Glov

In [None]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

documents = [
    TaggedDocument(words=['the', 'sun', 'is', 'shining'], tags=['doc1']),
    TaggedDocument(words=['the', 'sun', 'is', 'shining'], tags=['doc2']),
    # Add more documents as needed
]

model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=10)

# Infer vectors for the documents

# Calculate cosine similarity between the vectors
similarity = model.dv.similarity('doc1', 'doc2')


print("Cosine Similarity between doc1 and doc2:", similarity)