In [21]:
import psycopg2
from dotenv import load_dotenv
import os
from openai import OpenAI
import pandas as pd
from datetime import datetime
import pymysql

In [3]:
load_dotenv()

True

In [4]:
currentDateTime = datetime.now().strftime("%m-%d-%Y")

In [5]:
DB_HOST = os.environ.get("DB_HOST")
DB_USERNAME = os.environ.get("DB_USERNAME")
DB_PASSWORD = os.environ.get("DB_PASSWORD")
DB_PORT = os.environ.get("DB_PORT")
DB_NAME_BE = os.environ.get("DB_NAME_BE")

config = {
    'host': DB_HOST,
    'user': DB_USERNAME,
    'password': DB_PASSWORD,
    'database': DB_NAME_BE,
    'port': int(DB_PORT) 
}

conn = pymysql.connect(**config)

cursor = conn.cursor()

In [6]:
cursor.execute("SHOW TABLES")

results = cursor.fetchall()

for row in results:
    print(row)

('admins',)
('categories',)
('chatbots',)
('complaint_activities',)
('complaint_files',)
('complaint_likes',)
('complaint_processes',)
('complaints',)
('discussions',)
('faqs',)
('news',)
('news_comments',)
('news_files',)
('news_likes',)
('regencies',)
('users',)


In [7]:
cursor.execute("SELECT * FROM faqs")

results = cursor.fetchall()

for row in results:
    print(row)

(1, 'Apa itu KeluhProv?', 'KeluhProv adalah platform digital yang memungkinkan dan mempermudah warga banten untuk menyampaikan keluhan-keluhan mereka dengan tanggapan yang cepat oleh pemerintah setempat terkait Kesehatan, Pendidikan, Kependudukan, Keamanan, Infrastruktur, Lingkungan maupun Transportasi Daerah Banten.')
(2, 'Apa yang harus saya sertakan dalam aduan saya?', 'Saat menyampaikan aduan, harap sertakan informasi berikut: Foto aduan, lokasi kabupaten/kota, detail alamat, kategori aduan, tanggal kejadian, dan deskripsi aduan.')
(3, 'Berapa lama waktu yang dibutuhkan untuk mendapatkan tanggapan?', 'Kami berkomitmen untuk merespons keluhan Anda dalam waktu 1-2 hari pada jam kerja.')
(4, 'Berapa lama waktu yang dibutuhkan untuk aduan mendapatkan verifikasi?', 'Kami berkomitmen untuk merespons keluhan Anda dalam waktu 1-2 hari pada jam kerja.')
(5, 'Berapa lama waktu yang dibutuhkan untuk aduan mulai di proses ? ', 'Kami berkomitmen untuk melakukan proses terhadap keluhan Anda dala

In [None]:
DB_HOST_DE = os.environ.get("POSTGRES_HOST")
DB_USERNAME_DE = os.environ.get("POSTGRES_USER")
DB_PASSWORD_DE = os.environ.get("POSTGRES_PASSWORD")
DB_NAME_DE = os.environ.get("POSTGRES_DB")
DB_PORT_DE = os.environ.get("POSTGRES_PORT")

conn_data = psycopg2.connect(database = DB_NAME_DE, 
                        user = DB_USERNAME_DE, 
                        host= DB_HOST_DE,
                        password = DB_PASSWORD_DE,
                        port = int(DB_PORT_DE)
                        )

cursor_de = conn_data.cursor()

In [None]:
cursor_de.execute("DROP TABLE IF EXISTS complaint_facts CASCADE")
cursor_de.execute("DROP TABLE IF EXISTS admins CASCADE")

In [None]:
cursor_de.execute("""
    DO $$ 
    BEGIN
        IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'category_enum') THEN
            CREATE TYPE category_enum AS ENUM ('Kesehatan', 'Pendidikan', 'Kependudukan', 'Keamanan', 'Infrastruktur', 'Lingkungan', 'Transportasi');
        END IF;

        IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'status_enum') THEN
            CREATE TYPE status_enum AS ENUM ('verifikasi', 'on progress', 'selesai', 'ditolak');
        END IF;

        IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'type_enum') THEN
            CREATE TYPE type_enum AS ENUM ('private', 'public');
        END IF;
    END $$;
    
    CREATE TABLE IF NOT EXISTS users (
        id SERIAL PRIMARY KEY,
        name VARCHAR(255),
        username VARCHAR(255),
        email VARCHAR(255),
        telephone_number VARCHAR(20),
        password VARCHAR(255),
        profile_photo VARCHAR(255),
        created_at TIMESTAMP,
        updated_at TIMESTAMP,
        deleted_at TIMESTAMP NULL
    );
               
    CREATE TABLE IF NOT EXISTS admins (
        id SERIAL PRIMARY KEY,
        name VARCHAR(255),
        email VARCHAR(255),
        password VARCHAR(255),
        telephone_number VARCHAR(20),
        is_super_admin BOOLEAN,       
        profile_photo VARCHAR(255),
        created_at TIMESTAMP,
        updated_at TIMESTAMP,
        deleted_at TIMESTAMP NULL
    );

    CREATE TABLE IF NOT EXISTS complaints (
        id SERIAL PRIMARY KEY,
        category category_enum,
        description TEXT,
        regency VARCHAR(255),
        district VARCHAR(255),
        address VARCHAR(255),
        latitude VARCHAR(255),
        longitude VARCHAR(255),
        status status_enum,
        type type_enum,
        created_at TIMESTAMP,
        updated_at TIMESTAMP,
        deleted_at TIMESTAMP NULL
    );

    CREATE TABLE IF NOT EXISTS complaint_process (
        id SERIAL PRIMARY KEY,
        status status_enum,
        message TEXT,
        created_at TIMESTAMP,
        updated_at TIMESTAMP,
        deleted_at TIMESTAMP
    );

    CREATE TABLE IF NOT EXISTS complaint_facts (
        user_id BIGINT,
        complaint_id VARCHAR(255),
        complaints_process_id BIGINT,
        admin_id BIGINT,
        sum_complaint BIGINT,
        sum_verification BIGINT,
        sum_onprogress BIGINT,
        sum_resolved BIGINT,
        sum_rejected BIGINT,
        sum_deleted BIGINT,
        sum_public BIGINT,
        sum_private BIGINT,
        process_time BIGINT,
        user_rejected_complaints BIGINT,
        user_resolved_complaints BIGINT,
        last_complaints VARCHAR(255)
    );
""")

conn_data.commit()

In [None]:
cursor_de.execute("""SELECT table_name FROM information_schema.tables
       WHERE table_schema = 'public'""")
for table in cursor_de.fetchall():
    print(table)

('complaint_process',)
('complaints',)
('users',)
('admins',)
('complaint_facts',)


# EXTRACT DATA

In [10]:
cursor.execute("""
SELECT
    c.user_id,
    c.id AS complaint_id,
    cp.id AS complaints_process_id,
    a.id AS admin_id,
    (SELECT COUNT(id) FROM complaints) as total_complaints,
    (SELECT COUNT(status) FROM complaint_processes WHERE status = 'verifikasi') AS sum_verification,
    (SELECT COUNT(status) FROM complaint_processes WHERE status = 'on progress') AS sum_onprogress,
    (SELECT COUNT(status) FROM complaint_processes WHERE status = 'selesai') AS sum_resolved,
    (SELECT COUNT(status) FROM complaint_processes WHERE status = 'ditolak') AS sum_rejected,
    (SELECT COUNT(deleted_at) FROM complaints WHERE deleted_at IS NOT NULL) AS sum_deleted,
    (SELECT COUNT(type) FROM complaints WHERE type = 'public') AS sum_public,
    (SELECT COUNT(type) FROM complaints WHERE type = 'private') AS sum_private,
    DATEDIFF(
        (SELECT MAX(created_at) FROM complaint_processes WHERE status IN ('selesai', 'ditolak') AND complaint_id = c.id),
        (SELECT MAX(created_at) FROM complaint_processes WHERE status = 'verifikasi' AND complaint_id = c.id)
    ) AS process_time,
    COALESCE(u_rejected.count, 0) AS user_rejected_complaints,
    COALESCE(u_resolved.count, 0) AS user_resolved_complaints,
    lc.last_complaint_id AS last_complaints
    FROM
        complaints c
    JOIN
        complaint_processes cp ON c.id = cp.complaint_id
    JOIN
        users u ON c.user_id = u.id
    JOIN
        admins a ON cp.admin_id = a.id
    LEFT JOIN (
        SELECT user_id, COUNT(*) AS count
        FROM complaints
        WHERE status = 'ditolak'
        GROUP BY user_id
    ) u_rejected ON c.user_id = u_rejected.user_id
    LEFT JOIN (
        SELECT user_id, COUNT(*) AS count
        FROM complaints
        WHERE status = 'selesai'
        GROUP BY user_id
    ) u_resolved ON c.user_id = u_resolved.user_id
    LEFT JOIN (
        SELECT user_id, MAX(id) AS last_complaint_id
        FROM complaints
        GROUP BY user_id
    ) lc ON c.user_id = lc.user_id
    GROUP BY
        c.user_id, c.id, cp.id, a.id, u_rejected.count, u_resolved.count, lc.last_complaint_id
    ORDER BY
        user_id ASC;
""")

complaint_facts = cursor.fetchall()

complaint_facts = pd.DataFrame(complaint_facts, columns=['user_id',
                                      'complaint_id',
                                      'complaint_process_id',
                                      'admin_id',
                                      'total_complaints',
                                      'sum_verification',
                                      'sum_onprogress',
                                      'sum_resolved',
                                      'sum_rejected',
                                      'sum_deleted',
                                      'sum_public',
                                      'sum_private',
                                      'process_time',
                                      'user_rejected_complaints',
                                      'user_resolved_complaints',
                                      'last_complaints'])

complaint_facts.to_csv(f"complaint_facts/complaint_facts_{currentDateTime}.csv", index=False)

In [11]:
cursor.execute("""
SELECT
    id,
    name,
    email,
    telephone_number,
    is_super_admin,
    created_at,
    updated_at,
    deleted_at
FROM 
    admins
""")

admin_data = cursor.fetchall()

admin_data = pd.DataFrame(admin_data, columns=['id',
                                                'name',
                                                'email',
                                                'telephone_number',
                                                'is_super_admin',
                                                'created_at',
                                                'updated_at',
                                                'deleted_at'])

admin_data.to_csv(f"admins/admins_{currentDateTime}.csv", index=False)

In [12]:
cursor.execute("""
SELECT
    id,
    name,
    email,
    telephone_number,
    created_at,
    updated_at,
    deleted_at
FROM 
    users
""")

users_data = cursor.fetchall()

users_data = pd.DataFrame(users_data, columns=['id',
                                                'name',
                                                'email',
                                                'telephone_number',
                                                'created_at',
                                                'updated_at',
                                                'deleted_at'])

users_data.to_csv(f"users/users_{currentDateTime}.csv", index=False)

In [13]:
cursor.execute("""
SELECT
    id,
    description,
    address,
    type,
    total_likes,
    created_at,
    updated_at,
    deleted_at
FROM 
    complaints
""")

complaints_data = cursor.fetchall()

complaints_data = pd.DataFrame(complaints_data, columns=['id',
                                                    'description',
                                                    'address',
                                                    'type',
                                                    'total_likes',
                                                    'created_at',
                                                    'updated_at',
                                                    'deleted_at'])

complaints_data.to_csv(f"complaints/complaints_{currentDateTime}.csv", index=False)

# Transform Data

In [14]:
fact_table = pd.read_csv(f"complaint_facts/complaint_facts_{currentDateTime}.csv")

fact_table

Unnamed: 0,user_id,complaint_id,complaint_process_id,admin_id,total_complaints,sum_verification,sum_onprogress,sum_resolved,sum_rejected,sum_deleted,sum_public,sum_private,process_time,user_rejected_complaints,user_resolved_complaints,last_complaints
0,1,C-81j9aK9280,1,1,5,3,2,1,1,0,3,2,,0,1,C-8ksh&s9280
1,1,C-8ksh&s9280,2,1,5,3,2,1,1,0,3,2,0.0,0,1,C-8ksh&s9280
2,1,C-8ksh&s9280,3,2,5,3,2,1,1,0,3,2,0.0,0,1,C-8ksh&s9280
3,1,C-8ksh&s9280,4,2,5,3,2,1,1,0,3,2,0.0,0,1,C-8ksh&s9280
4,1,C-8ksh&s9280,5,2,5,3,2,1,1,0,3,2,0.0,0,1,C-8ksh&s9280
5,2,C-81jas92581,6,1,5,3,2,1,1,0,3,2,,0,0,C-81jas92581
6,2,C-81jas92581,7,3,5,3,2,1,1,0,3,2,,0,0,C-81jas92581
7,3,C-123j9ak280,11,1,5,3,2,1,1,0,3,2,,1,0,C-271j9ak280
8,3,C-123j9ak280,12,4,5,3,2,1,1,0,3,2,,1,0,C-271j9ak280
9,3,C-271j9ak280,8,1,5,3,2,1,1,0,3,2,,1,0,C-271j9ak280


In [29]:
fact_table.dtypes

fact_table.isnull().sum()

user_id                     0
complaint_id                0
complaint_process_id        0
admin_id                    0
total_complaints            0
sum_verification            0
sum_onprogress              0
sum_resolved                0
sum_rejected                0
sum_deleted                 0
sum_public                  0
sum_private                 0
process_time                0
user_rejected_complaints    0
user_resolved_complaints    0
last_complaints             0
dtype: int64

In [22]:
def complaint_id_format(complaint_id):
    if not complaint_id.startswith("C-"):
        return "C-" + complaint_id
    return complaint_id

fact_table['complaint_id'] = fact_table['complaint_id'].apply(complaint_id_format)

fact_table['process_time'] = fact_table['process_time'].fillna(0)

fact_table.to_csv(f"cleaned_data/cleaned_data_complaint_facts_{currentDateTime}.csv", index=False)

In [23]:
fact_table

Unnamed: 0,user_id,complaint_id,complaint_process_id,admin_id,total_complaints,sum_verification,sum_onprogress,sum_resolved,sum_rejected,sum_deleted,sum_public,sum_private,process_time,user_rejected_complaints,user_resolved_complaints,last_complaints
0,1,C-81j9aK9280,1,1,5,3,2,1,1,0,3,2,0.0,0,1,C-8ksh&s9280
1,1,C-8ksh&s9280,2,1,5,3,2,1,1,0,3,2,0.0,0,1,C-8ksh&s9280
2,1,C-8ksh&s9280,3,2,5,3,2,1,1,0,3,2,0.0,0,1,C-8ksh&s9280
3,1,C-8ksh&s9280,4,2,5,3,2,1,1,0,3,2,0.0,0,1,C-8ksh&s9280
4,1,C-8ksh&s9280,5,2,5,3,2,1,1,0,3,2,0.0,0,1,C-8ksh&s9280
5,2,C-81jas92581,6,1,5,3,2,1,1,0,3,2,0.0,0,0,C-81jas92581
6,2,C-81jas92581,7,3,5,3,2,1,1,0,3,2,0.0,0,0,C-81jas92581
7,3,C-123j9ak280,11,1,5,3,2,1,1,0,3,2,0.0,1,0,C-271j9ak280
8,3,C-123j9ak280,12,4,5,3,2,1,1,0,3,2,0.0,1,0,C-271j9ak280
9,3,C-271j9ak280,8,1,5,3,2,1,1,0,3,2,0.0,1,0,C-271j9ak280


In [37]:
komplain = pd.read_csv(f"complaints/complaints_{currentDateTime}.csv")

komplain

Unnamed: 0,id,description,address,type,total_likes,created_at,updated_at,deleted_at
0,C-123j9ak280,"Lorem ipsum dolor sit amet, consectetur adipis...","Jl. lorem ipsum No. 1 RT 01 RW 01, Kelurahan L...",public,0,2024-06-14 09:44:45.252,2024-06-14 09:44:45.252,
1,C-271j9ak280,"Lorem ipsum dolor sit amet, consectetur adipis...","Jl. lorem ipsum No. 1 RT 01 RW 01, Kelurahan L...",public,1,2024-06-14 09:44:45.252,2024-06-14 09:44:45.252,
2,C-81j9aK9280,"Lorem ipsum dolor sit amet, consectetur adipis...","Jl. lorem ipsum No. 1 RT 01 RW 01, Kelurahan L...",public,3,2024-06-14 09:44:45.252,2024-06-14 09:44:45.252,
3,C-81jas92581,"Lorem ipsum dolor sit amet, consectetur adipis...","Jl. lorem ipsum No. 1 RT 01 RW 01, Kelurahan L...",private,2,2024-06-14 09:44:45.252,2024-06-14 09:44:45.252,
4,C-8ksh&s9280,"Lorem ipsum dolor sit amet, consectetur adipis...","Jl. lorem ipsum No. 1 RT 01 RW 01, Kelurahan L...",private,2,2024-06-14 09:44:45.252,2024-06-14 13:12:58.929,


In [27]:
komplain.dtypes

komplain.isnull().sum()

id              object
description     object
address         object
type            object
total_likes      int64
created_at      object
updated_at      object
deleted_at     float64
dtype: object

In [60]:
def complaint_id_format(id):
    if not id.startswith("C-"):
        return "C-" + id
    return id

komplain['id'] = komplain['id'].apply(complaint_id_format)

cols = ['created_at', 'updated_at', 'deleted_at']

for col in cols:
    komplain[col] = pd.to_datetime(komplain[col]).dt.normalize()

komplain

komplain.to_csv(f"cleaned_data/cleaned_data_complaints_{currentDateTime}.csv", index=False)

In [59]:
komplain.dtypes

id                     object
description            object
address                object
type                   object
total_likes             int64
created_at     datetime64[ns]
updated_at     datetime64[ns]
deleted_at     datetime64[ns]
dtype: object

In [62]:
users = pd.read_csv(f"users/users_{currentDateTime}.csv")

users

Unnamed: 0,id,name,email,telephone_number,created_at,updated_at,deleted_at
0,1,userr1,user1@gmail.com,812121212,2024-06-09 08:25:59.037,2024-06-14 04:16:30.556,
1,2,User 2,user2@gmail.com,81234567890,2024-06-09 08:25:59.037,2024-06-09 08:25:59.037,
2,3,User 3,user3@gmail.com,81234567890,2024-06-09 08:25:59.037,2024-06-09 08:25:59.037,
3,4,dummy,dummycapstonekel8@gmail.com,8123123123123,2024-06-10 04:05:48.895,2024-06-10 04:07:24.644,
4,5,capst8,kelompok8capstone@gmail.com,12341234123,2024-06-10 09:13:24.480,2024-06-10 09:13:41.045,
5,6,asd,asd@gmail.com,1212,2024-06-11 08:06:08.147,2024-06-11 08:06:08.518,
6,7,asd,asdd@gmail.com,81231231,2024-06-12 03:07:45.132,2024-06-12 03:07:45.492,
7,8,asd,asddsa@gmail.com,123123,2024-06-12 03:47:33.416,2024-06-12 03:47:33.710,
8,9,dummy33,dummy33@gmail.com,12312321,2024-06-12 04:13:37.638,2024-06-12 04:13:38.017,
9,10,testingdummy,testingdummy@gmail.com,123123123,2024-06-12 04:45:47.650,2024-06-12 04:45:48.011,


In [81]:
cols = ['created_at', 'updated_at', 'deleted_at']

for col in cols:
    users[col] = pd.to_datetime(users[col]).dt.normalize()

users.to_csv(f"cleaned_data/cleaned_data_users_{currentDateTime}.csv", index=False)

users

Unnamed: 0,id,name,email,telephone_number,created_at,updated_at,deleted_at
0,1,userr1,user1@gmail.com,812121212,2024-06-09,2024-06-14,NaT
1,2,User 2,user2@gmail.com,81234567890,2024-06-09,2024-06-09,NaT
2,3,User 3,user3@gmail.com,81234567890,2024-06-09,2024-06-09,NaT
3,4,dummy,dummycapstonekel8@gmail.com,8123123123123,2024-06-10,2024-06-10,NaT
4,5,capst8,kelompok8capstone@gmail.com,12341234123,2024-06-10,2024-06-10,NaT
5,6,asd,asd@gmail.com,1212,2024-06-11,2024-06-11,NaT
6,7,asd,asdd@gmail.com,81231231,2024-06-12,2024-06-12,NaT
7,8,asd,asddsa@gmail.com,123123,2024-06-12,2024-06-12,NaT
8,9,dummy33,dummy33@gmail.com,12312321,2024-06-12,2024-06-12,NaT
9,10,testingdummy,testingdummy@gmail.com,123123123,2024-06-12,2024-06-12,NaT


In [80]:
users.dtypes

users.isnull().sum()


id                           int64
name                        object
email                       object
telephone_number             int64
created_at          datetime64[ns]
updated_at          datetime64[ns]
deleted_at          datetime64[ns]
dtype: object

In [83]:
admins = pd.read_csv(f"admins/admins_{currentDateTime}.csv")

admins

Unnamed: 0,id,name,email,telephone_number,is_super_admin,created_at,updated_at,deleted_at
0,1,Super Admin,super_admin@gmail.com,81234567890,1,2024-06-09 08:25:57.682,2024-06-09 08:25:57.682,
1,2,Admin Pandeglang,admin_pandeglang@gmail.com,81234567890,0,2024-06-09 08:25:57.682,2024-06-09 08:25:57.682,
2,3,Admin Lebak,admin_lebak@gmail.com,81234567890,0,2024-06-09 08:25:57.682,2024-06-09 08:25:57.682,
3,4,Admin Serang,admin_serang@gmail.com,81234567890,0,2024-06-09 08:25:57.682,2024-06-09 08:25:57.682,
4,5,admin coba,coba.admin@outlook.com,82155008990,0,2024-06-10 17:23:35.160,2024-06-10 17:24:24.996,
5,6,admin coba1,coba.admin1@outlook.com,82155008990,0,2024-06-10 17:24:50.927,2024-06-10 17:24:50.927,2024-06-10 17:25:01.398


In [84]:
admins.dtypes

id                   int64
name                object
email               object
telephone_number     int64
is_super_admin       int64
created_at          object
updated_at          object
deleted_at          object
dtype: object

In [87]:
cols = ['created_at', 'updated_at', 'deleted_at']

for col in cols:
    admins[col] = pd.to_datetime(admins[col]).dt.normalize()

admins.to_csv(f"cleaned_data/cleaned_data_admins_{currentDateTime}.csv", index=False)

admins

Unnamed: 0,id,name,email,telephone_number,is_super_admin,created_at,updated_at,deleted_at
0,1,Super Admin,super_admin@gmail.com,81234567890,1,2024-06-09,2024-06-09,NaT
1,2,Admin Pandeglang,admin_pandeglang@gmail.com,81234567890,0,2024-06-09,2024-06-09,NaT
2,3,Admin Lebak,admin_lebak@gmail.com,81234567890,0,2024-06-09,2024-06-09,NaT
3,4,Admin Serang,admin_serang@gmail.com,81234567890,0,2024-06-09,2024-06-09,NaT
4,5,admin coba,coba.admin@outlook.com,82155008990,0,2024-06-10,2024-06-10,NaT
5,6,admin coba1,coba.admin1@outlook.com,82155008990,0,2024-06-10,2024-06-10,2024-06-10


In [86]:
admins.dtypes

id                           int64
name                        object
email                       object
telephone_number             int64
is_super_admin               int64
created_at          datetime64[ns]
updated_at          datetime64[ns]
deleted_at          datetime64[ns]
dtype: object