In [8]:
import pandas as pd
import os
import re
import string
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
import pandas as pd

### Читаем данные из .txt файлов в pandas.DataFrame

In [4]:
def load_data_from_dir(directory):
    """Load labelled text files from ``directory`` into a DataFrame.

    The function reads every ``.txt`` file found in the ``pos`` and ``neg``
    sub-directories of ``directory`` (in that order). The rating is parsed
    from the file name: the part before the first ``.`` is split on ``_`` and
    the second piece is converted to ``int`` (e.g. ``"12_7.txt"`` -> ``7``).

    Args:
        directory: Path to a folder that contains ``pos`` and ``neg``
            sub-directories.

    Returns:
        pandas.DataFrame with columns ``text`` (file contents), ``label``
        (1 for ``pos``, 0 for ``neg``) and ``ratings`` (int parsed from the
        file name). Rows are ordered by label (``pos`` first) and then by
        file name, so the result is identical on every run and platform.

    Raises:
        FileNotFoundError: if ``pos`` or ``neg`` is missing under ``directory``.
        IndexError, ValueError: if a ``.txt`` file name does not contain an
            integer rating after the first underscore.
    """
    records = []
    for label in ['pos', 'neg']:
        dir_path = os.path.join(directory, label)
        label_id = 1 if label == 'pos' else 0
        # os.listdir returns entries in arbitrary order; sort so that the row
        # order of the resulting frame is reproducible.
        for filename in sorted(os.listdir(dir_path)):
            if not filename.endswith('.txt'):
                continue
            # Keep the stem in its own variable instead of overwriting the
            # loop variable that is still needed to open the file.
            stem = filename.split('.')[0]
            rating = int(stem.split('_')[1])
            with open(os.path.join(dir_path, filename), 'r', encoding='utf-8') as f:
                records.append((f.read(), label_id, rating))
    return pd.DataFrame(records, columns=['text', 'label', 'ratings'])

In [5]:
# Read the train and test splits; each call scans the pos/ and neg/
# sub-directories of the given folder and returns one DataFrame.
train_data = load_data_from_dir("../datasets/train")
test_data = load_data_from_dir("../datasets/test")

In [6]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,text,label,ratings
0,Bromwell High is a cartoon comedy. It ran at t...,1,9
1,Homelessness (or Houselessness as George Carli...,1,8
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,10
3,This is easily the most underrated film inn th...,1,7
4,This is not the typical Mel Brooks film. It wa...,1,8


In [7]:
# Preview the first rows of the test split.
test_data.head()

Unnamed: 0,text,label,ratings
0,I went and saw this movie last night after bei...,1,10
1,Actor turned director Bill Paxton follows up h...,1,7
2,As a recreational golfer with some knowledge o...,1,9
3,"I saw this film in a sneak preview, and it is ...",1,8
4,Bill Paxton has taken the true story of the 19...,1,8


### Преобразуем текст

In [9]:
# Download every NLTK resource that preprocess_text needs, so the notebook
# also runs on a machine with an empty nltk_data directory.
nltk.download('punkt')
# Recent NLTK releases load the word_tokenize models from 'punkt_tab'
# instead of 'punkt'; fetching it as well is harmless on older releases.
nltk.download('punkt_tab')
# WordNetLemmatizer reads the WordNet corpus; without it lemmatize() raises
# LookupError on first use.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hwndr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hwndr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache so the stop-word list is read and the lemmatizer is
# built once, not once per document.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if 'stop_words' not in _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = set(stopwords.words('english'))
    if 'lemmatizer' not in _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = nltk.WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw text into a space-separated string of lemmas.

    Steps, in order: lower-case, delete ASCII punctuation, delete digit runs,
    tokenize with ``nltk.word_tokenize``, drop English stop words, lemmatize
    each remaining token with ``WordNetLemmatizer`` and join with spaces.

    Args:
        text: The raw text (``str``).

    Returns:
        The processed text as a single string; empty if no token survives.

    NOTE(review): punctuation (including apostrophes) is removed before the
    stop-word filter, so a contraction such as "don't" becomes "dont" and
    presumably no longer matches the stop-word list - confirm whether that
    is intended.
    """
    stop_words, lemmatizer = _get_text_resources()

    # Lower-case the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Drop stop words, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join the tokens back into one string
    return ' '.join(tokens)

In [11]:
# Register tqdm's progress_apply on pandas objects, then add a 'clean_text'
# column to each split (train first, then test).
tqdm.pandas(desc="Предобработка комментариев")

for frame in (train_data, test_data):
    frame['clean_text'] = frame['text'].progress_apply(preprocess_text)

Предобработка комментариев: 100%|██████████| 25000/25000 [00:27<00:00, 916.97it/s] 
Предобработка комментариев: 100%|██████████| 25000/25000 [00:25<00:00, 968.93it/s] 


### Сохраняем в csv

In [12]:
# Write both processed splits to CSV without the index column.
for frame, csv_path in (
    (train_data, "../datasets/train_clean_data.csv"),
    (test_data, "../datasets/test_clean_data.csv"),
):
    frame.to_csv(csv_path, index=False)