In [None]:
# Tokenization example with 'Kupfer, blank I, Millberry'
import spacy
import spacy_transformers
import de_dep_news_trf
import en_core_web_sm

# Load the German pipeline `de_dep_news_trf`
de_nlp = spacy.load('de_dep_news_trf')

# Load the English pipeline `en_core_web_sm`
en_nlp = spacy.load('en_core_web_sm')

# Run the German pipeline on a sample material alias
doc = de_nlp('Kupfer, blank I, Millberry')

# Collect the surface text of every token in the Doc
tokens = [tok.text for tok in doc]

# Show the resulting token list
print(tokens)

## Tokenization

The next step is to tokenize our data. Tokenization is the process of splitting text into tokens, the basic units (typically words and punctuation marks) that later steps operate on. There are several techniques to do this. These are the steps we are going to take in preprocessing:

1. Converting words into lowercase
2. Removing leading and trailing whitespaces
3. Converting words into lemmas
4. Removing punctuation
5. Removing stopwords
6. Expanding contractions
7. Removing special characters

Note that most of our text is in German. Therefore, we need to use the language-specific models described in the [spaCy documentation](https://spacy.io/usage/models).
Because of this, we will need an algorithm to determine which language the text is (or is likely to be) written in, so that we can perform the best possible tokenization.

_Note: after loading the spaCy package for the first time, you might need to reboot your computer if you get this error:_
> ValueError: [E002] Can't find factory for 'transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

As part of the baseline exercise, we will use standard stopwords. However, a potential improvement is to build custom stopword lists tailored to the use-case of scrap metal.


In [None]:
# Import the default stop-word sets explicitly. `import spacy` alone does not
# guarantee that the `spacy.lang.de` / `spacy.lang.en` submodules are bound, so
# reaching them via attribute access only works after a pipeline of that
# language happens to have been loaded.

# German default stopwords
from spacy.lang.de.stop_words import STOP_WORDS as de_stopwords

# English default stopwords
from spacy.lang.en.stop_words import STOP_WORDS as en_stopwords

In this step, we convert words to lemmas. A lemma is the base form of a word with its inflections removed, which decreases variance. For example, the lemma of a verb is its infinitive form.


In [None]:
# Tokens that carry no information for matching and are dropped after lemmatization
uninteresting_char = set([',', '(', ')', 'und'])

def preprocess(text, language='en'):
    """Lemmatize ``text`` and drop stopwords and uninteresting tokens.

    Args:
        text: The raw text to process. Non-string values (e.g. ``None`` or
            ``NaN`` coming from missing DataFrame cells) and empty strings
            yield an empty result instead of raising.
        language: ``'de'`` selects the German pipeline and stopword set; any
            other value selects the English ones.

    Returns:
        The remaining lemmas joined by single spaces (``''`` if none remain).
    """
    # Missing values cannot be tokenized; treat them as "no text"
    if not isinstance(text, str) or not text:
        return ''

    # Pick the pipeline and stopword set for the requested language
    if language == 'de':
        nlp, stopwords = de_nlp, de_stopwords
    else:
        nlp, stopwords = en_nlp, en_stopwords

    # Named-entity recognition and dependency parsing are not needed for lemmas
    doc = nlp(text, disable=['ner', 'parser'])

    kept = []
    for token in doc:
        lemma = token.lemma_
        # The stopword sets are lowercase, so compare case-insensitively;
        # otherwise capitalized lemmas would slip through the filter.
        key = lemma.lower()
        if key in stopwords or key in uninteresting_char:
            continue
        kept.append(lemma)
    return ' '.join(kept)

# Preprocess every material alias, passing the row's language code along
df1['p_ds_material_alias'] = df1.apply(
    lambda row: preprocess(row['ds_material_alias'], row['cd_alias_language']),
    axis=1,
)

In [None]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings
vectorizer = CountVectorizer()

# Preprocessed aliases as a NumPy array
corpus = df1['p_ds_material_alias'].values

# Learn the vocabulary and count term occurrences per document
bow_matrix = vectorizer.fit_transform(corpus)

# Dense DataFrame of the counts, with one column per vocabulary term
bow_df = pd.DataFrame(
    bow_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
# Plot top 50 words frequencies
import matplotlib.pyplot as plt

# Total count of each vocabulary term over all documents
word_freq = bow_df.sum()

# Bar chart of the 50 most frequent terms
top_words = word_freq.sort_values(ascending=False).head(50)
ax = top_words.plot(kind='bar', figsize=(15, 7))
ax.set_title('Top 50 words frequencies')
ax.set_ylabel('Frequency')
ax.set_xlabel('Words')
plt.show()


In [None]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TfidfVectorizer object (rebinds `vectorizer`, which previously held
# the CountVectorizer)
vectorizer = TfidfVectorizer()

# Generate the TF-IDF matrix.
# NOTE(review): `ted` is not defined in the visible part of this notebook —
# presumably a leftover from a tutorial; confirm whether `corpus` was intended.
tfidf_matrix = vectorizer.fit_transform(ted)

# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)

## Calculating the cosine similarity

# Imported here so this cell does not depend on an import made elsewhere
from sklearn.metrics.pairwise import cosine_similarity

# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Compute and print the pairwise cosine similarity between all documents
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

In [None]:
# Imported here because this cell runs before the cell that imports it
from sklearn.metrics.pairwise import linear_kernel

# Initialize the TfidfVectorizer with the built-in English stopword list
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
# NOTE(review): `transcripts` is not defined in the visible part of this
# notebook — confirm where it comes from.
tfidf_matrix = tfidf.fit_transform(transcripts)

# Generate the similarity matrix of the documents against themselves.
# get_recommendations indexes this matrix by row position and expects each
# row to contain the document's own score, so both arguments must be the same
# matrix (`tfidf_matrix1` was an undefined name).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Generate mapping between titles and index
# NOTE(review): `metadata` is not defined in the visible part of this notebook.
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

def get_recommendations(title, cosine_sim, indices):
    """Return the titles of the 10 entries most similar to ``title``.

    Args:
        title: Key looked up in ``indices`` to obtain the query's row position.
        cosine_sim: Similarity matrix indexed by row position; row ``i`` is
            read as the scores of entry ``i`` against every entry.
        indices: Mapping from title to row position (e.g. a ``pd.Series``).

    Returns:
        ``metadata['title']`` restricted to the 10 best-scoring rows other
        than the query itself, best match first.
    """
    # Get index of the entry that matches title
    idx = indices[title]
    # Exclude the query by its position. Dropping the first sorted hit instead
    # would discard the wrong row whenever another entry ties with the query
    # and sorts ahead of it.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]
    # Sort by similarity, best first (stable, so ties keep their row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    # Keep the row positions of the 10 most similar entries
    movie_indices = [i for i, _ in sim_scores[:10]]
    return metadata['title'].iloc[movie_indices]

In [None]:
We should create word embeddings for the different forms of metals. For instance,
Kupfer, Copper, and Cu should all map to the same concept. This would increase the performance of the model without needing as much data.

# Import train_test_split

from sklearn.model_selection import train_test_split

# Target: the product id; features: every other column
y = material_desc_df['product_id']
X = material_desc_df.drop(columns='product_id')

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Initialize the TfidfVectorizer
tfidf = TfidfVectorizer()

# Corpus: the material texts, in the row order of material_desc_df
corpus_orig = material_desc_df['text'].to_list()

# Construct the TF-IDF matrix from corpus_orig (not the earlier `corpus`
# built from df1), so its rows line up with material_desc_df and `indices`
tfidf_matrix = tfidf.fit_transform(corpus_orig)

# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Generate mapping between product ids and index
indices = pd.Series(material_desc_df.index, index=material_desc_df['product_id']).drop_duplicates()

# Create function to return top n matches
def get_top_n_matches(material, cosine_sim, indices, n=10):
    """Return the ``n`` entries of ``material_desc_df`` most similar to ``material``.

    The TF-IDF model is refitted on ``corpus_orig`` plus the query, so the
    query shares the fitted vocabulary. This refits the module-level ``tfidf``.

    Args:
        material: Free-text material description to match.
        cosine_sim: Unused; kept for backward compatibility. Similarities are
            recomputed because they must include ``material``.
        indices: Unused; kept for backward compatibility.
        n: Maximum number of matches to return.

    Returns:
        A ``pd.DataFrame`` with columns ``product_id`` and
        ``similarity_score``, best match first.
    """
    # Build a new list: list.append returns None and would also grow the
    # shared corpus_orig on every call
    corpus = corpus_orig + [material]
    # Construct the TF-IDF matrix; the query is the last row
    tfidf_matrix = tfidf.fit_transform(corpus)
    # Only the query row is needed: its similarity to each original entry.
    # Slicing keeps the query itself out of the candidates.
    query_scores = linear_kernel(tfidf_matrix[-1:], tfidf_matrix[:-1]).ravel()
    # Order the results by similarity, best first, and keep the top n
    sim_scores = sorted(
        enumerate(query_scores), key=lambda pair: pair[1], reverse=True
    )[:n]
    positions = [i for i, _ in sim_scores]
    return pd.DataFrame({
        'product_id': material_desc_df['product_id'].iloc[positions].to_numpy(),
        'similarity_score': [score for _, score in sim_scores],
    })



In [None]:
# Load the material aliases table
materials = pd.read_csv('data_git/material_aliases.csv')

# Display the product id column
materials['cd_product_id']

In [None]:
# Initialize the TfidfVectorizer
tfidf = TfidfVectorizer()

# Corpus: one text per row of material_desc_df (a fresh list on every run,
# so the append below does not accumulate across re-runs)
corpus = material_desc_df['text'].to_list()

# Material to test
material = 'Copper milb'

# Add the query as the last document so it shares the fitted vocabulary
corpus.append(material)
query_idx = len(corpus) - 1

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(corpus)
# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Scores of the query against every other document. The query is excluded by
# its position: dropping the first sorted hit instead would discard a real
# match whenever a material ties with the query, and would leave the query's
# own index (which has no row in material_desc_df) in the results.
sim_scores = [
    (i, score) for i, score in enumerate(cosine_sim[-1]) if i != query_idx
]

# Sort the materials based on the similarity scores
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

# Get the scores for 10 most similar materials
sim_scores = sim_scores[:10]

# Get the material indices
material_indices = [i[0] for i in sim_scores]

# Product ids of the top 10 most similar materials
top_n_scores = material_desc_df['product_id'].iloc[material_indices]

# Create DataFrame with top 10 most similar materials and their scores
top_n_scores_df = pd.DataFrame({'product_id': top_n_scores, 'similarity_score': [i[1] for i in sim_scores]})

# Merge with materials DataFrame to get material details
result = pd.merge(top_n_scores_df, materials, on='product_id')

result

In [None]:
# Directory holding the input CSV files, relative to the working directory
path = os.path.join(os.getcwd(), "data/")

# Load the data
materials = pd.read_csv(os.path.join(path, 'materials.csv'))
material_desc_df = pd.read_csv(os.path.join(path, 'material_desc.csv'))

# Initialize the TfidfVectorizer
tfidf = TfidfVectorizer()

# Corpus: one text per row of material_desc_df
corpus = material_desc_df['text'].to_list()

# Material to test
material = 'Copper milb'

# Add the query as the last document so it shares the fitted vocabulary
corpus.append(material)
query_idx = len(corpus) - 1

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(corpus)
# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Scores of the query against every other document. The query is excluded by
# its position: dropping the first sorted hit instead would discard a real
# match whenever a material ties with the query, and would leave the query's
# own index (which has no row in material_desc_df) in the results.
sim_scores = [
    (i, score) for i, score in enumerate(cosine_sim[-1]) if i != query_idx
]

# Sort the materials based on the similarity scores
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

# Get the scores for 10 most similar materials
sim_scores = sim_scores[:10]

# Get the material indices
material_indices = [i[0] for i in sim_scores]

# Product ids of the top 10 most similar materials
top_n_scores = material_desc_df['product_id'].iloc[material_indices]

# Create DataFrame with top 10 most similar materials and their scores
top_n_scores_df = pd.DataFrame({'product_id': top_n_scores, 'similarity_score': [i[1] for i in sim_scores]})

# Merge with materials DataFrame to get material details
result = pd.merge(top_n_scores_df, materials, on='product_id')

result