# Exploration and Preprocessing

In [27]:
# Import relevant packages
import pandas as pd
import numpy as np
import os
import json

## Data Loading

In [28]:
        
# Read the scraped startup records. The encoding is given explicitly because
# JSON is UTF-8 and the platform default encoding is not guaranteed to be.
with open('../../data/raw/startup_list_0_5000_without_cookie.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# The top-level keys are not used; each value is one record (a dict of column
# values), so the values alone become the rows. Copy each record so the
# DataFrame input is independent of `data`.
rows = [dict(record) for record in data.values()]

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(rows)
df

Unnamed: 0,name,original_idx,website_url,website_text
0,itravel,0,https://www.itravel.de/,itravel Telefonische Beratung +49 221 8282 888...
1,kunveno,2,https://kunveno.de,Kunveno - Work Happier WORK HAPPIER Booste Unt...
2,t2k: Text to Knowledge,3,https://text2knowledge.de,Text 2 Knowledge You.
3,Scopas,4,https://www.scopas.io/,ScopasWe're working on something new.Check it ...
4,studymaniac,5,https://studymaniac.de,Studymaniac - erfolgreich Studieren Open main ...
...,...,...,...,...
3284,Coleap,4994,https://coleap.com/,Coleap Transform your content into incomeTurn ...
3285,Trade Machines FI,4995,http://trademachines.com,▷ All used industrial equipment online on Trad...
3286,Yasoon,4996,http://yasoon.com/,Home - yasoon Patrick Partner Manager Get in t...
3287,Phoneboost,4997,https://www.phoneboost.de/,Phoneboost – Boost up your phone! AKKU LEER?PH...


## Cleaning the HTML as a preprocessing step for text translation

In [29]:
import re

#functions to clean Html as first step of the preprocessing
# Non-greedy match of a single HTML tag; compiled once instead of on every call.
_HTML_TAG_PATTERN = re.compile('<.*?>')

# (search, replacement) pairs applied in this order after tag removal:
# newlines, mis-decoded German umlauts / sharp s, ampersands, and two other
# garbled character sequences that are replaced by a blank.
_TEXT_REPLACEMENTS = (
    ("\n", " "),
    ("√ú", "Ü"),
    ("√ü", "ß"),
    ("√∂", "ö"),
    ("√º", "ü"),
    ("√§", "ä"),
    ("&", "and"),
    ("‚Äú", " "),
    ("¬†", " "),
)


def cleanHtml(sentence):
    """Strip HTML tags and repair known garbled character sequences.

    Args:
        sentence: Raw text. Non-string values are converted with ``str()``;
            missing values (``None`` or a float NaN) yield an empty string so
            they are not turned into the literal words "None" / "nan".

    Returns:
        The cleaned text as ``str``. Every tag and every newline is replaced
        by a single space.
    """
    # `sentence != sentence` is only true for NaN.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ""

    cleantext = _HTML_TAG_PATTERN.sub(' ', str(sentence))
    for search, replacement in _TEXT_REPLACEMENTS:
        cleantext = cleantext.replace(search, replacement)
    return cleantext

# Clean every scraped text in place (overwrites the raw 'website_text' column),
# then display the frame to inspect the result.
df['website_text'] = df['website_text'].apply(cleanHtml)
df

Unnamed: 0,name,original_idx,website_url,website_text
0,itravel,0,https://www.itravel.de/,itravel Telefonische Beratung +49 221 8282 888...
1,kunveno,2,https://kunveno.de,Kunveno - Work Happier WORK HAPPIER Booste Unt...
2,t2k: Text to Knowledge,3,https://text2knowledge.de,Text 2 Knowledge You.
3,Scopas,4,https://www.scopas.io/,ScopasWe're working on something new.Check it ...
4,studymaniac,5,https://studymaniac.de,Studymaniac - erfolgreich Studieren Open main ...
...,...,...,...,...
3284,Coleap,4994,https://coleap.com/,Coleap Transform your content into incomeTurn ...
3285,Trade Machines FI,4995,http://trademachines.com,▷ All used industrial equipment online on Trad...
3286,Yasoon,4996,http://yasoon.com/,Home - yasoon Patrick Partner Manager Get in t...
3287,Phoneboost,4997,https://www.phoneboost.de/,Phoneboost – Boost up your phone! AKKU LEER?PH...


# Text translation

In [32]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic unless the factory is seeded; fix the seed so
# re-running the cell yields the same language labels.
DetectorFactory.seed = 0

# Texts of at most this many characters are treated as too short to be useful.
MAX_SHORT_TEXT_LENGTH = 24
# Placeholder text stored when the website could not be fetched.
FETCH_ERROR_TEXT = 'Error: Failed to get response'

# Count the number of texts per detected language (kept rows only).
lang_counters = {}
detected_languages = {}
rows_to_drop = []

for index, text in df['website_text'].items():
    # Exactly one branch applies per row, so a row is never dropped or counted
    # twice even if it matches several of the unusable-text criteria.
    if len(text) == 0 or text == FETCH_ERROR_TEXT:
        print(f"\033[33m Attention: \033[0m At index {index} no text is found and will be deleted.")
        rows_to_drop.append(index)
    elif len(text) <= MAX_SHORT_TEXT_LENGTH:
        print(f"\033[33m Attention: \033[0m At index {index} text is too short and will be deleted: {text}")
        rows_to_drop.append(index)
    elif 'Seite wurde nicht gefunden' in text:
        print(f"\033[33m Attention: \033[0m At index {index} website could not be scraped and will be deleted: {text}")
        rows_to_drop.append(index)
    elif 'Site is undergoing maintenance' in text:
        print(f"\033[33m Attention: \033[0m At index {index} website is under maintanenance and will be deleted: {text}")
        rows_to_drop.append(index)
    else:
        try:
            language = detect(text)
        except LangDetectException:
            # Raised when the text has no usable features (e.g. only digits or
            # symbols); keep the row but mark its language as unknown.
            language = 'unknown'
        detected_languages[index] = language
        lang_counters[language] = lang_counters.get(language, 0) + 1

# Drop all unusable rows in one operation instead of rebuilding the frame
# once per row inside the loop.
deleted_row_counter = len(rows_to_drop)
df = df.drop(index=rows_to_drop)
df['language'] = pd.Series(detected_languages, dtype=object)

print(f"Numbers of texts in different languages: {lang_counters}")
print(f"Number of deleted rows: {deleted_row_counter}")

[33m Attention: [0m At index 2 text is too short and will be deleted: Text 2 Knowledge You.
[33m Attention: [0m At index 5 text is too short and will be deleted: Homepage - Numaferm
[33m Attention: [0m At index 20 text is too short and will be deleted: BidX Tool
[33m Attention: [0m At index 22 text is too short and will be deleted: Error: 503
[33m Attention: [0m At index 67 text is too short and will be deleted: Field 33
[33m Attention: [0m At index 83 text is too short and will be deleted: Error: 403
[33m Attention: [0m At index 97 no text is found and will be deleted.
[33m Attention: [0m At index 108 no text is found and will be deleted.
[33m Attention: [0m At index 116 text is too short and will be deleted: www.lieferando.de
[33m Attention: [0m At index 120 website could not be scraped and will be deleted: Seite wurde nicht gefunden. • Die LOYAL App Search for: 404 Error 404 Not Found Oops! That page can’t be found. It looks like nothing was found at this locatio

In [35]:
# Target language code for the translation step.
choosen_lang ='en'

# Create an empty (all-None) column that will receive the translated text;
# the translation loop uses None to recognise rows that still need translating.
df[f'website_text_in_{choosen_lang}'] = None
df

Unnamed: 0,name,original_idx,website_url,website_text,language,website_text_in_en
0,itravel,0,https://www.itravel.de/,itravel Telefonische Beratung +49 221 8282 888...,de,
1,kunveno,2,https://kunveno.de,Kunveno - Work Happier WORK HAPPIER Booste Unt...,de,
3,Scopas,4,https://www.scopas.io/,ScopasWe're working on something new.Check it ...,en,
4,studymaniac,5,https://studymaniac.de,Studymaniac - erfolgreich Studieren Open main ...,de,
6,Aicone,9,http://www.ai-c.one,Aicone - Artificial Intelligence Cloud One | A...,en,
...,...,...,...,...,...,...
3284,Coleap,4994,https://coleap.com/,Coleap Transform your content into incomeTurn ...,en,
3285,Trade Machines FI,4995,http://trademachines.com,▷ All used industrial equipment online on Trad...,en,
3286,Yasoon,4996,http://yasoon.com/,Home - yasoon Patrick Partner Manager Get in t...,en,
3287,Phoneboost,4997,https://www.phoneboost.de/,Phoneboost – Boost up your phone! AKKU LEER?PH...,de,


In [39]:
from deep_translator import GoogleTranslator

# Maximum number of characters sent in one request.
# NOTE(review): presumably chosen to stay below the translator's per-request
# length limit — confirm against the deep_translator documentation.
CHUNK_SIZE = 4900

target_column = f'website_text_in_{choosen_lang}'

# One translator instance is reused for all requests instead of building a new
# one per chunk.
translator = GoogleTranslator(source='auto', target=choosen_lang)


def _translate_in_chunks(text):
    """Translate ``text`` piecewise and return the pieces joined by a space.

    The text is cut into consecutive slices of at most CHUNK_SIZE characters.
    A text that fits into one slice is translated with a single request.
    Returns None when no slice produced a translation, so the row stays
    marked as untranslated and is retried on the next run of this cell.
    """
    chunks = [text[start:start + CHUNK_SIZE] for start in range(0, len(text), CHUNK_SIZE)]
    translated_chunks = [translator.translate(chunk) for chunk in chunks]
    # Skip empty results so the join cannot fail on a None entry.
    translated_chunks = [chunk for chunk in translated_chunks if chunk]
    return ' '.join(translated_chunks) if translated_chunks else None


# Translate every row that has no translation yet. All rows are sent to the
# translator (source language is auto-detected), including those already
# written in the chosen language. Rows translated in an earlier, interrupted
# run are skipped, which makes the cell resumable.
for index, text in df['website_text'].items():
    if not pd.isna(df.at[index, target_column]):
        continue
    df.at[index, target_column] = _translate_in_chunks(text)
    print(f"Text at index {index} was translated.")

# The detected language describes the original text, not the translation.
df = df.rename(columns={'language': 'original_language'})
       

Text at index 3218 was translated.
Text at index 3219 was translated.
Text at index 3220 was translated.
Text at index 3221 was translated.
Text at index 3222 was translated.
Text at index 3223 was translated.
Text at index 3224 was translated.
Text at index 3226 was translated.
Text at index 3227 was translated.
Text at index 3228 was translated.
Text at index 3229 was translated.
Text at index 3230 was translated.
Text at index 3231 was translated.
Text at index 3232 was translated.
Text at index 3233 was translated.
Text at index 3234 was translated.
Text at index 3235 was translated.
Text at index 3236 was translated.
Text at index 3237 was translated.
Text at index 3238 was translated.
Text at index 3239 was translated.
Text at index 3240 was translated.
Text at index 3241 was translated.
Text at index 3242 was translated.
Text at index 3243 was translated.
Text at index 3244 was translated.
Text at index 3245 was translated.
Text at index 3246 was translated.
Text at index 3247 w

In [40]:
# Spot-check the first rows during or after the translation run.
df.head()

Unnamed: 0,name,original_idx,website_url,website_text,original_language,website_text_in_en
0,itravel,0,https://www.itravel.de/,itravel Telefonische Beratung +49 221 8282 888...,de,itravel Telephone advice +49 221 8282 8880 | S...
1,kunveno,2,https://kunveno.de,Kunveno - Work Happier WORK HAPPIER Booste Unt...,de,Kunveno - Work Happier WORK HAPPIER Boost cult...
3,Scopas,4,https://www.scopas.io/,ScopasWe're working on something new.Check it ...,en,ScopasWe're working on something new.Check it ...
4,studymaniac,5,https://studymaniac.de,Studymaniac - erfolgreich Studieren Open main ...,de,Studymaniac - study successfully Open main men...
6,Aicone,9,http://www.ai-c.one,Aicone - Artificial Intelligence Cloud One | A...,en,Aicone - Artificial Intelligence Cloud One | A...


In [42]:
# Load the result of the previous steps.
# NOTE(review): `df_result_after_translation` is not defined in any cell shown
# in this notebook — on a fresh kernel this line raises NameError unless it is
# created elsewhere; confirm where it comes from.
df = df_result_after_translation.copy()
# This bare expression is not the last statement of the cell, so nothing is displayed.
df

# Store the translated frame as a JSON file.
df.to_json('../../data/preprocessed/translated_results_en.json')

# Data Cleaning

## Preprocessing functions from the exercise

In [81]:
# Load the translated texts and keep only the translated version of the content:
# the original "Content" column is dropped and "Content_in_de" takes over its name.
# NOTE(review): an earlier cell writes '../../data/preprocessed/translated_results_en.json',
# while this cell reads 'translated_results.json' from the working directory —
# confirm which file is intended.
df = (
    pd.read_json('translated_results.json')
    .drop(columns="Content")
    .rename(columns={"Content_in_de": "Content"})
)

# Show the resulting DataFrame
print(df)

     original_language                                            Content
0                   en  ANKOMMEN | Ihre LieblingsmarkenHolen Sie sich ...
1                   de  Rückabwicklung - von Lebensversicherungen Wir ...
2                   de     Starke Versicherungen für deinen Lifestyle ...
3                   en  RESTUBE - the airbag for more freedom and safe...
4                   de  CureVac - Wir revolutionieren die mRNA für das...
...                ...                                                ...
3275                de                      LEMONSGATE                ...
3276                de  Safily   Überspringen Suchen Suche schließen Z...
3277                de  Startseite | ICO-LUX   Fraud Prevention Über u...
3278                en  Versus | Alles vergleichen KategorienSmartphon...
3279                de  Selfstorage München and deutschlandweit | Stor...

[3258 rows x 2 columns]


In [74]:
# import packages
import re

# import pandas as pd
# import numpy as np

# import matplotlib.pyplot as plt
# import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')


from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

# from skmultilearn.problem_transform import BinaryRelevance

[nltk_data] Downloading package stopwords to /Users/kathi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## <span style="color:green">To do: Determine a threshold to throw out other frequently occurring words </span>


In [84]:
# define functions

def cleanPunc(sentence):
    """Remove or blank out punctuation and lowercase German umlauts.

    Args:
        sentence: Text to clean (``str``).

    Returns:
        The text with ``? ! ' " #`` deleted, with ``. , ( ) | \\ /`` each
        replaced by a space, with ``Ä Ö Ü`` mapped to ``ä ö ü``, and with
        leading/trailing whitespace stripped.
    """
    # Inside a character class '|' is a literal, not an alternation separator,
    # so the characters are listed without it here.
    cleaned = re.sub('[?!\'"#]', '', sentence)
    # Separators become a space so adjacent words are not glued together
    # ("Home|About" -> "Home About"); the backslash is escaped to match itself.
    cleaned = re.sub(r'[.,()|\\/]', ' ', cleaned)
    cleaned = cleaned.translate(str.maketrans('ÄÖÜ', 'äöü'))
    return cleaned.strip()

def keepAlpha(sentence):
    """Keep only Latin letters, German umlauts and ß in each word.

    The sentence is split on whitespace; within every word each run of other
    characters is replaced by one space. The words are re-joined with single
    spaces and the result is stripped at both ends.
    """
    non_letters = re.compile('[^a-zA-ZäöüÄÖÜß]+', flags=re.UNICODE)
    cleaned_words = [non_letters.sub(' ', token) for token in sentence.split()]
    return " ".join(cleaned_words).strip()

def removeStopWords(sentence, stopwords):
    """Replace every stop word in ``sentence`` with a single space.

    Args:
        sentence: Text to filter.
        stopwords: Iterable of stop-word strings. This parameter is what is
            matched; the function no longer reads a module-level variable.

    Returns:
        The text in which each stop word, together with the one non-word
        character following it, is replaced by a space. A stop word at the
        very end of the text is removed as well. Matching is case-insensitive.
    """
    if not stopwords:
        return sentence
    # Escape each word so it is matched literally.
    alternatives = "|".join(re.escape(word) for word in stopwords)
    re_stop_words = re.compile(r"\b(?:" + alternatives + r")(?:\W|$)", re.I)
    return re_stop_words.sub(" ", sentence)

def stemming(sentence):
    """Stem every whitespace-separated word with the module-level ``stemmer``
    and return the stems joined by single spaces."""
    return " ".join(stemmer.stem(word) for word in sentence.split()).strip()

# define stopwords
stop_words = set(stopwords.words(['german', 'english']))

# define stemmer
# NOTE(review): a German stemmer is applied to every text regardless of its
# language — confirm this matches the language the texts were translated into.
stemmer = SnowballStemmer('german')

# The text column is called 'website_text' in the first part of the notebook
# and 'Content' in the frame loaded from the translated JSON; use whichever
# is present so this cell works with either frame.
text_column = 'website_text' if 'website_text' in df.columns else 'Content'

# apply functions: lowercase -> punctuation -> letters only -> stop words -> stems
df[text_column] = (
    df[text_column]
    .astype(str)
    .str.lower()
    .apply(cleanPunc)
    .apply(keepAlpha)
    # Pass the stop-word set itself, not the NLTK `stopwords` corpus object.
    .apply(removeStopWords, stopwords=stop_words)
    .apply(stemming)
)

In [85]:
# Check the result: show the first 50 preprocessed rows.
df.head(50)

Unnamed: 0,original_language,Content
0,en,ankomm lieblingsmarkenhol appuns visionpartner...
1,de,ruckabwickl lebensversicher hol doppelt leb re...
2,de,stark versicher lifestyl held de hom produkt f...
3,en,restub airbag freedom safety wat restub skip c...
4,de,curevac revolutioni mrna leb mensch fuhrungste...
5,en,hom open bank project inhalt spring produkt op...
6,en,viamon gmbh hom de hom technologi servic konta...
7,en,hivebuy unternehmensweit ord rechnung losung h...
8,de,startseit greenflash start mission referenz pr...
9,de,sit undergoing maintenanc doctorsgat wartungsm...


In [None]:
# function for tokenization -> only use when vectorizer doesn't include tokenization (probably included)
#def tokenize(text):
#    tokens = re.split(r'\W+', text)

#    return tokens

# applying function to the column
#df['website_text'] = df['website_text'].apply(lambda x: tokenize(x))
#df

In [43]:
# Specify the path to the folder where the CSV file is saved.
# NOTE(review): earlier cells use '../../data/...'; this one uses '../data/...' —
# confirm which location is intended.
folder_path = '../data/preprocessed'

# Specify the file name and full path for the CSV file
file = 'preprocessed_data.csv'
path = os.path.join(folder_path, file)

# Create the target folder if it does not exist yet; to_csv does not create
# missing directories and would fail otherwise.
os.makedirs(folder_path, exist_ok=True)

# Save the preprocessed data to a CSV file (without the index column)
df.to_csv(path, index=False)

In [47]:
# Write the installed packages to requirements_preprocessing.txt.
# `pip list --format=freeze` is used so each line is `name==version`
# rather than a local path in place of the package version.
!pip list --format=freeze > requirements_preprocessing.txt