In [7]:
import pandas as pd
from hdfs import InsecureClient
from sklearn.preprocessing import MinMaxScaler

# === 1. Load the dataset ===
# The CSV is addressed through a file:// URL on the notebook host.
file_path = "file:///home/jovyan/kt3/World Important Dates.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows before any transformation is applied.
print("Исходные данные:", df.head(), sep="\n")

# === 2. Normalize the data ===
# Pick every column that pandas inferred as numeric.
# NOTE(review): this also rescales identifier-like numeric columns
# (e.g. a serial-number column) — confirm that is intended.
numeric_cols = df.select_dtypes(include=['number']).columns

if len(numeric_cols) == 0:
    # Nothing to scale: report it and leave the frame untouched.
    print("\nНет числовых столбцов для нормализации — пропускаем шаг.")
else:
    # Min-max scale the numeric columns and write the result back into `df`.
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df[numeric_cols])
    df[numeric_cols] = scaled_values
    print("\nНормализованные данные:", df.head(), sep="\n")

# === 3. Save as Parquet ===
# The file name is relative, so it lands in the kernel's working directory;
# the DataFrame index is not written to the file.
parquet_file = "World_Important_Dates.parquet"
df.to_parquet(path=parquet_file, index=False)
print(f"\nФайл сохранён в {parquet_file}")

# === 4. Upload to HDFS ===
# Set this to your NameNode address (e.g. 'http://localhost:9870').
hdfs_url = "http://namenode:9870"
# `user` is the Hadoop user name the requests are issued as.
client = InsecureClient(url=hdfs_url, user='root')

# Destination path of the Parquet file inside HDFS.
hdfs_path = '/user/hadoop/World_Important_Dates.parquet'

# Stream the local Parquet file to HDFS, replacing any existing file there.
with open(parquet_file, mode='rb') as parquet_stream:
    client.write(hdfs_path, data=parquet_stream, overwrite=True)

print(f"\nФайл успешно загружен в HDFS по пути {hdfs_path}")


Исходные данные:
   Sl. No                      Name of Incident     Date    Month     Year  \
0       1  Indus Valley Civilization Flourishes  Unknown  Unknown  2600 BC   
1       2               Battle of the Ten Kings  Unknown  Unknown  1400 BC   
2       6  Establishment of the Delhi Sultanate  Unknown  Unknown     1206   
3       7                     Battle of Panipat       21    April     1526   
4       8          Establishment of British Raj        1      May     1858   

  Country Type of Event    Place Name  \
0   India  Civilization  Indus Valley   
1   India        Battle        Punjab   
2   India     Political         Delhi   
3   India        Battle       Panipat   
4   India      Colonial   Whole India   

                                              Impact  \
0  Development of one of the world's earliest urb...   
1  Rigvedic tribes consolidated their control ove...   
2          Muslim rule established in parts of India   
3           Foundation of the Mughal Empire

In [2]:
!pip install hdfs

Collecting hdfs
  Downloading hdfs-2.7.3.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m245.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting docopt (from hdfs)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: hdfs, docopt
  Building wheel for hdfs (setup.py) ... [?25ldone
[?25h  Created wheel for hdfs: filename=hdfs-2.7.3-py3-none-any.whl size=34325 sha256=2a72345c2f17d01156f389723fb9a8545ba55b497a5dde6eb9e7875e9e5cd38c
  Stored in directory: /home/jovyan/.cache/pip/wheels/b9/1d/dc/eb0833be25464c359903d356c4204721c6a672c26ff164cdc3
  Building wheel for docopt (setup.py) ... [?25ldone
[?25h  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13705 sha256=c4599faee1b560fe36c9cea5fa3ee30415e4ced664a210f837015d9ee30c7bfd
  Stored in directory: /home/jovyan/.cache