# Импорт библиотек

In [1]:
from pyspark.sql import functions as f
from pyspark.sql import types as t

# Создание сессии Spark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('simple_app').master('local[*]').getOrCreate()

# Работа с данными

In [4]:
df = spark.read.format('csv').option('inferSchema', 'true').option('header', 'true').load('../data/clients.csv')

In [5]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- fullname: string (nullable = true)
 |-- address: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- email: string (nullable = true)
 |-- workplace: string (nullable = true)
 |-- birthdate: timestamp (nullable = true)
 |-- registration_date: timestamp (nullable = true)
 |-- gender: string (nullable = true)
 |-- income: integer (nullable = true)
 |-- expenses: integer (nullable = true)
 |-- credit: integer (nullable = true)
 |-- deposit: integer (nullable = true)



In [6]:
df.show(20)

+---+--------------------+--------------------+------------------+--------------------+--------------------+-------------------+-------------------+------+------+--------+------+-------+
| id|            fullname|             address|      phone_number|               email|           workplace|          birthdate|  registration_date|gender|income|expenses|credit|deposit|
+---+--------------------+--------------------+------------------+--------------------+--------------------+-------------------+-------------------+------+------+--------+------+-------+
|  1|Волкова Фаина Афа...|с. Новосибирск, н...|  8 (475) 014-6494|wkirillov@rambler.ru|ст. Кировск (Мурм...|1966-09-07 00:00:00|2017-01-10 00:00:00|     F|230910|   61975|     1|   null|
|  2|Пелагея Вячеславо...|клх Белокуриха, ш...| +7 (981) 576-1146|bogdanovmoke@ramb...|г. Елабуга, бул. ...|1987-05-23 00:00:00|2016-07-14 00:00:00|     F|  null|   75232|  null|   null|
|  3|Валентина Игоревн...|к. Усть-Катав, бу...|+7 (920) 068-58-36

In [7]:
df.count()

1000

In [128]:
import pandas as pd
import numpy as np

In [129]:
df = pd.read_csv('../data/transactions.csv')

In [130]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,929135.0,464567.0,268218.315518,0.0,232283.5,464567.0,696850.5,929134.0
client_id,929135.0,500.573223,288.664178,1.0,251.0,501.0,750.0,1000.0
product_category,929135.0,9.348996,7.495062,1.0,5.0,6.0,11.0,29.0
amount,929135.0,1143.750824,3948.343863,50.0,83.0,347.0,1042.0,149891.0


In [131]:
df.head(1)

Unnamed: 0,id,client_id,product_category,product_company,subtype,amount,date,transaction_type
0,34478,37,29,,Автоплатеж,1209,2020-01-01 00:00:00,Negative


In [132]:
df.loc[:, 'date'] = pd.to_datetime(df.loc[:, 'date'])

In [133]:
def make_fake_df(df, shift, max_id):

    df_fake = df.copy()
    df_fake.loc[:, 'date'] += pd.offsets.DateOffset(years=shift)
    df_fake.loc[:, 'id'] += df['id'].max() + 1
    df_fake.loc[:, 'amount'] *= np.random.choice(range(11, 20, 1))/10.

    df_fake.loc[:, 'client_id'] = np.where(df_fake['client_id'] < 100, df_fake['client_id'] + np.random.choice(range(1, 10, 1)), df['client_id'])
    df_fake.loc[:, 'client_id'] = np.where(df_fake['client_id'] > 900, df_fake['client_id'] - np.random.choice(range(1, 10, 1)), df['client_id'])
    df_fake.loc[:, 'client_id'] = np.where((df_fake['client_id'] >= 100) & (df_fake['client_id'] < 500) , df_fake['client_id'] + np.random.choice(range(1, 10, 1)), df['client_id'])
    df_fake.loc[:, 'client_id'] = np.where((df_fake['client_id'] >= 500) & (df_fake['client_id'] <= 900) , df_fake['client_id'] - np.random.choice(range(1, 10, 1)), df['client_id'])

    return df_fake

In [134]:
df.loc[:, 'date'] -= pd.offsets.DateOffset(years=10)

In [135]:
df.head(5)

Unnamed: 0,id,client_id,product_category,product_company,subtype,amount,date,transaction_type
0,34478,37,29,,Автоплатеж,1209,2010-01-01,Negative
1,65406,71,4,YouTube Music,Подписка,169,2010-01-01,Negative
2,82186,88,29,,Автоплатеж,3711,2010-01-01,Negative
3,83113,89,29,,Автоплатеж,6443,2010-01-01,Negative
4,131373,141,29,,Автоплатеж,7319,2010-01-01,Negative


In [136]:
df_raw = df.copy()

for i in range(1, 13):

    max_id = df['id'].max()
    df_fake = make_fake_df(df_raw, i, max_id)
    df = pd.concat(objs=[df, df_fake])

df.reset_index(drop=True)

Unnamed: 0,id,client_id,product_category,product_company,subtype,amount,date,transaction_type
0,34478,37,29,,Автоплатеж,1209.0,2010-01-01 00:00:00,Negative
1,65406,71,4,YouTube Music,Подписка,169.0,2010-01-01 00:00:00,Negative
2,82186,88,29,,Автоплатеж,3711.0,2010-01-01 00:00:00,Negative
3,83113,89,29,,Автоплатеж,6443.0,2010-01-01 00:00:00,Negative
4,131373,141,29,,Автоплатеж,7319.0,2010-01-01 00:00:00,Negative
...,...,...,...,...,...,...,...,...
12078750,940772,13,6,Стрелка,Покупка,90.0,2022-12-31 23:58:04,Negative
12078751,1108473,194,6,,Покупка,97.2,2022-12-31 23:58:11,Negative
12078752,1440379,546,3,Ситимобил,Покупка,563.4,2022-12-31 23:58:13,Negative
12078753,1143928,232,6,,Покупка,156.6,2022-12-31 23:59:03,Negative


In [137]:
df['date'].max()

Timestamp('2022-12-31 23:59:12')

In [138]:
df.to_csv('../data/transactions_full.csv', index=None)