# Exploration and Preprocessing Patrick

In [1]:
# Import relevant packages
import pandas as pd
import numpy as np
import os

## Data Loading

In [265]:
# List the folders containing the text files
folders = ['../data/raw/txt_files_0_1750', '../data/raw/txt_files_1751_3500', '../data/raw/txt_files_3501_5000']

# Initialize an empty list to store the data
data = []

# Iterate through each folder
for folder in folders:
    # Get the list of text files in the folder
    file_list = os.listdir(folder)
    
    # Iterate through each text file
    for file_name in file_list:
        # Read the contents of the file
        file_path = os.path.join(folder, file_name)
        try:
            with open(file_path, 'r') as file:
                content = file.read()
                # Append content to the data list
                data.append(content)
                
        except FileNotFoundError:
            print(f"File {file_name} not found.")
        except IOError:
            print(f"\033[91m Error reading the file {file_name}.\033[0m")
        
# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=['Content'])
df

Unnamed: 0,Content
0,ARIVE | Your favorite brandsGet the appOur Vis...
1,Rückabwicklung - von Lebensversicherungen\nWir...
2,\nStarke Versicherungen für deinen Lifestyle...
3,\nRESTUBE - the airbag for more freedom and s...
4,CureVac - Wir revolutionieren die mRNA für das...
...,...
3275,\n LEMONSGATE ...
3276,Safily\n \n√úberspringen\nSuchen\nSuche schlie...
3277,Startseite | ICO-LUX\n \nFraud Prevention\nÜbe...
3278,Versus | Compare everything\nCategoriessmartph...


# Text translation

## Translation of the text via deep

In [271]:
from langdetect import detect

#count number of texts in different langauges
lang_counters = {}
for index, row in df.iterrows():
    if len(row['Content']) != 0 and index != 2700:
        language = detect(row['Content'])
        df.at[index, 'language'] = language
        if language in lang_counters:
            lang_counters[language] += 1
        else:
            lang_counters[language] = 1
    else: 
        print(f"\033[91m Attention: \033[0m At index {index} no text is found")

print(f"number of texts in different languages: {lang_counters}")

[91m Attention: [0m At index 77 no text is found
[91m Attention: [0m At index 134 no text is found
[91m Attention: [0m At index 279 no text is found
[91m Attention: [0m At index 323 no text is found
[91m Attention: [0m At index 397 no text is found
[91m Attention: [0m At index 423 no text is found
[91m Attention: [0m At index 625 no text is found
[91m Attention: [0m At index 851 no text is found
[91m Attention: [0m At index 952 no text is found
[91m Attention: [0m At index 1064 no text is found
[91m Attention: [0m At index 1269 no text is found
[91m Attention: [0m At index 1634 no text is found
[91m Attention: [0m At index 1698 no text is found
[91m Attention: [0m At index 1842 no text is found
[91m Attention: [0m At index 1992 no text is found
[91m Attention: [0m At index 2046 no text is found
[91m Attention: [0m At index 2438 no text is found
[91m Attention: [0m At index 2528 no text is found
[91m Attention: [0m At index 2633 no text is found
[9

In [272]:
df.head(20)

Unnamed: 0,Content,language
0,ANKOMMEN | Ihre LieblingsmarkenHolen Sie sich ...,de
1,Rückabwicklung - von Lebensversicherungen\nWir...,de
2,\nStarke Versicherungen für deinen Lifestyle...,de
3,RESTUBE - the airbag for more freedom and safe...,de
4,CureVac - Wir revolutionieren die mRNA für das...,de
5,Startseite – Open-Bank-Projekt\n \n \nZum Inha...,de
6,viamon GmbH :: Startseite\nDE\nHeim\nTechnolog...,de
7,Hivebuy: Ihre unternehmensweite Order-to-Rechn...,de
8,Startseite – Greenflash\nStart\nÜber uns\nUnse...,de
9,Site is undergoing maintenance \n \n \nDoctors...,de


In [273]:
from deep_translator import GoogleTranslator

#take the language in which more texts are written
max_lang_count = max(lang_counters, key=lang_counters.get)
print(f"The most of the texts are written in '{max_lang_count}")

#translate all texts not written in the choosen language 

for index, row in df.iterrows():
    translated_chunks=[]
    if row['language'] != max_lang_count:
        if len(row['Content']) > 4900:
            # Split the text into smaller chunks for translation
            chunk_size = 4900  # Choose chank size
            chunks = [row['Content'][i:i+chunk_size] for i in range(0, len(row['Content']), chunk_size)]

            # Translate each chunk and join them back
            translated_chunks = []
            for chunk in chunks:
                translation = GoogleTranslator(source='auto', target=max_lang_count).translate(chunk)
                translated_chunks.append(translation)
            translation = ' '.join(translated_chunks)
        else:
            translation = GoogleTranslator(source='auto', target=max_lang_count).translate(row['Content'])
        df.at[index, 'Content'] = translation
        print(f"Text at index {index} was translated.")

df.rename(columns={'language': 'original_language'}, inplace=True)    
       

The most of the texts are written in 'de
Text at index 17 was translated.
Text at index 50 was translated.
Text at index 77 was translated.
Text at index 134 was translated.
Text at index 202 was translated.
Text at index 215 was translated.
Text at index 252 was translated.
Text at index 279 was translated.
Text at index 284 was translated.
Text at index 323 was translated.
Text at index 334 was translated.
Text at index 350 was translated.
Text at index 397 was translated.
Text at index 409 was translated.
Text at index 412 was translated.
Text at index 423 was translated.
Text at index 446 was translated.
Text at index 464 was translated.
Text at index 508 was translated.
Text at index 521 was translated.
Text at index 615 was translated.
Text at index 625 was translated.
Text at index 636 was translated.
Text at index 638 was translated.
Text at index 652 was translated.
Text at index 734 was translated.
Text at index 762 was translated.
Text at index 790 was translated.
Text at in

In [None]:
df.head()

# Data Cleaning

## Preprocesssing Functions from the exercise

In [211]:
# import packages
import re

# import pandas as pd
# import numpy as np

# import matplotlib.pyplot as plt
# import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

# from skmultilearn.problem_transform import BinaryRelevance

In [214]:
# define functions
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ', cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def removeStopWords(sentence, stopwords):
    re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)   
    return re_stop_words.sub(" ", sentence)

def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

# define stopwords
stop_words = set(stopwords.words(['german', 'english']))

# define stemmer
stemmer = SnowballStemmer('german', 'english')

# apply functions
df['Content'] = df['Content'].str.lower()
df['Content'] = df['Content'].apply(cleanHtml)
df['Content'] = df['Content'].apply(cleanPunc)
df['Content'] = df['Content'].apply(keepAlpha)
df['Content'] = df['Content'].apply(removeStopWords, stopwords=stopwords)
df['Content'] = df['Content'].apply(stemming)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/kathi/nltk_data'
    - '/Users/kathi/VSC-projects/bda-startup-clustering/bda_cluster_venv/nltk_data'
    - '/Users/kathi/VSC-projects/bda-startup-clustering/bda_cluster_venv/share/nltk_data'
    - '/Users/kathi/VSC-projects/bda-startup-clustering/bda_cluster_venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import deepl
from langdetect import detect

#count number of texts in different langauges
counters = {}
for index, row in df.iterrows():
    language = detect(row['Content'])
    if language in counters:
        counters[language] += 1
    else:
        counters[language] = 1

    print(f"number of texts in diffrent languages: {counters}")


#take the language in which more texts are written


#translate all texts not written in the choosen language 


In [41]:
# check result
df.head()

Unnamed: 0,Content
0,ariv favorit brandsget appour visionpartnershi...
1,r ckabwickl lebensversicher hol doppelt leb re...
2,stark versicher f r lifestyl held de hom produ...
3,restub airbag freedom safety wat restub skip c...
4,curevac revolutioni mrna f r leb mensch ber be...


In [None]:
# function for tokenization -> only use when vectorizer doesn't include tokenization (probably included)
#def tokenize(text):
#    tokens = re.split(r'\W+', text)

#    return tokens

# applying function to the column
#df['Content'] = df['Content'].apply(lambda x: tokenize(x))
#df

In [43]:
# Specifiy the path to the folder where you want to save the CSV file
folder_path = '../data/preprocessed'

# Specify the file name and full path for the CSV file
file = 'preprocessed_data.csv'
path = os.path.join(folder_path, file)

# Save the preprocessed data to a CSV file
df.to_csv(path, index=False)

In [47]:
# generate requirements.txt without some weird paths instead of the package version
!pip list --format=freeze > requirements_preprocessing.txt