## 1. Import Dataset

In [None]:
import pandas as pd

In [None]:
# Load the user-base CSV (path is relative to the working directory) into a DataFrame
data = pd.read_csv('Netflix Userbase.csv')

In [None]:
# Preview the first rows of the freshly loaded data
data.head()

Unnamed: 0,User ID,Subscription Type,Monthly Revenue,Join Date,Last Payment Date,Country,Age,Gender,Device,Plan Duration
0,1,Basic,10,15-01-22,10-06-23,United States,28,Male,Smartphone,1 Month
1,2,Premium,15,05-09-21,22-06-23,Canada,35,Female,Tablet,1 Month
2,3,Standard,12,28-02-23,27-06-23,United Kingdom,42,Male,Smart TV,1 Month
3,4,Standard,12,10-07-22,26-06-23,Australia,51,Female,Laptop,1 Month
4,5,Basic,10,01-05-23,28-06-23,Germany,33,Male,Smartphone,1 Month


In [None]:
# Summarize column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   User ID            2500 non-null   int64 
 1   Subscription Type  2500 non-null   object
 2   Monthly Revenue    2500 non-null   int64 
 3   Join Date          2500 non-null   object
 4   Last Payment Date  2500 non-null   object
 5   Country            2500 non-null   object
 6   Age                2500 non-null   int64 
 7   Gender             2500 non-null   object
 8   Device             2500 non-null   object
 9   Plan Duration      2500 non-null   object
dtypes: int64(3), object(7)
memory usage: 195.4+ KB


## 2. Data Cleaning

In [None]:
# Keep only the two columns the later cells work with. Selecting by name
# (instead of dropping the other eight) lets the cell be re-run without a
# KeyError and ignores any extra columns the CSV may gain; .copy() detaches the
# result so later column assignments operate on an independent frame.
data = data[['Country', 'Device']].copy()

In [None]:
# Preview the frame after column removal
data.head()

Unnamed: 0,Country,Device
0,United States,Smartphone
1,Canada,Tablet
2,United Kingdom,Smart TV
3,Australia,Laptop
4,Germany,Smartphone


## 3. Attribute Augmentation

In [None]:
from geopy.geocoders import Nominatim
import pandas as pd
from geopy.exc import GeocoderTimedOut
from concurrent.futures import ThreadPoolExecutor, as_completed

# Initialize the geolocator (Nominatim client used by get_coordinates below)
geolocator = Nominatim(user_agent="lowki", timeout=10)

# Look up the coordinates of a country
def get_coordinates(country, max_retries=3):
    """Geocode a country name with the notebook-level ``geolocator``.

    Args:
        country: Name passed to ``geolocator.geocode``.
        max_retries: Number of additional attempts made after a
            ``GeocoderTimedOut`` before giving up.

    Returns:
        ``(country, latitude, longitude)``. Latitude and longitude are ``None``
        when the geocoder returns no match or every attempt timed out.
    """
    # Retry in a bounded loop: retrying by unbounded self-recursion ends in
    # RecursionError (after hammering the service) if it stays unreachable.
    for _ in range(max_retries + 1):
        try:
            location = geolocator.geocode(country)
        except GeocoderTimedOut:
            continue
        if location:
            return (country, location.latitude, location.longitude)
        break  # the service answered but found nothing; retrying will not help
    return (country, None, None)

# Speed up the lookups by running them on a thread pool
def geocode_countries(countries, max_workers=5):
    """Geocode country names concurrently via ``get_coordinates``.

    Args:
        countries: Iterable of country names. Repeated names are geocoded only
            once, so passing a whole DataFrame column does not trigger one
            request per row.
        max_workers: Size of the thread pool.

    Returns:
        List of the tuples returned by ``get_coordinates``, one per distinct
        name, in completion order. Lookups that raise are reported on stdout
        and left out of the list.
    """
    # dict.fromkeys removes repeats while keeping first-seen order
    unique_countries = list(dict.fromkeys(countries))
    coordinates = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_country = {
            executor.submit(get_coordinates, country): country
            for country in unique_countries
        }
        for future in as_completed(future_to_country):
            country = future_to_country[future]
            try:
                # no local named `data`: that name is the notebook's DataFrame
                coordinates.append(future.result())
            except Exception as exc:
                print(f'{country} generated an exception: {exc}')
    return coordinates

# Geocode each distinct country once; every row of a country shares the result,
# so sending the full 2500-row column would only repeat identical requests.
countries = data['Country'].dropna().unique().tolist()
results = geocode_countries(countries)

# Add the results to the dataframe
for country, lat, lon in results:
    country_rows = data['Country'] == country  # build the row mask once per country
    data.loc[country_rows, 'Latitude'] = lat
    data.loc[country_rows, 'Longitude'] = lon

print(data)


             Country      Device   Latitude   Longitude
0      United States  Smartphone  39.783730 -100.445882
1             Canada      Tablet  61.066692 -107.991707
2     United Kingdom    Smart TV  54.702354   -3.276575
3          Australia      Laptop -24.776109  134.755000
4            Germany  Smartphone  51.163818   10.447831
...              ...         ...        ...         ...
2495           Spain    Smart TV  39.326068   -4.837979
2496           Spain    Smart TV  39.326068   -4.837979
2497   United States      Laptop  39.783730 -100.445882
2498          Canada      Tablet  61.066692 -107.991707
2499   United States    Smart TV  39.783730 -100.445882

[2500 rows x 4 columns]


In [None]:
from geopy.geocoders import Nominatim
import pandas as pd
from geopy.exc import GeocoderTimedOut

# Re-create the Nominatim client with the same settings as the earlier geocoding cell
geolocator = Nominatim(user_agent="lowki", timeout=10)

def get_coordinates(country, max_retries=3):
    """Geocode a country name with the notebook-level ``geolocator``.

    Note: this redefines the ``get_coordinates`` from the earlier geocoding
    cell and returns a 2-tuple instead of that version's 3-tuple.

    Args:
        country: Name passed to ``geolocator.geocode``.
        max_retries: Number of additional attempts made after a
            ``GeocoderTimedOut`` before giving up.

    Returns:
        ``(latitude, longitude)``, or ``(None, None)`` when the geocoder
        returns no match or every attempt timed out.
    """
    # Bounded retry loop: retrying by unbounded self-recursion ends in
    # RecursionError if the service stays unreachable.
    for _ in range(max_retries + 1):
        try:
            location = geolocator.geocode(country)
        except GeocoderTimedOut:
            continue
        if location:
            return location.latitude, location.longitude
        break  # the service answered but found nothing; retrying will not help
    return None, None

# Geocode each distinct country once and broadcast the result to its rows.
# Looking every row up individually repeats the same network request thousands
# of times, and writing through enumerate() positions with .at only works while
# the index is the default RangeIndex.
coords_by_country = {
    country: get_coordinates(country)
    for country in data['Country'].dropna().unique()
}

# Countries without a geocoding result end up as missing values in both columns
data['Latitude'] = data['Country'].map(
    {country: lat for country, (lat, _) in coords_by_country.items()}
)
data['Longitude'] = data['Country'].map(
    {country: lon for country, (_, lon) in coords_by_country.items()}
)

print(data)

             Country      Device   Latitude   Longitude
0      United States  Smartphone   39.78373 -100.445882
1             Canada      Tablet  61.066692 -107.991707
2     United Kingdom    Smart TV  54.702354   -3.276575
3          Australia      Laptop -24.776109     134.755
4            Germany  Smartphone  51.163818   10.447831
...              ...         ...        ...         ...
2495           Spain    Smart TV  39.326068   -4.837979
2496           Spain    Smart TV  39.326068   -4.837979
2497   United States      Laptop   39.78373 -100.445882
2498          Canada      Tablet  61.066692 -107.991707
2499   United States    Smart TV   39.78373 -100.445882

[2500 rows x 4 columns]


In [None]:
# Preview the frame with the added Latitude/Longitude columns
data.head()

Unnamed: 0,Country,Device,Latitude,Longitude
0,United States,Smartphone,39.78373,-100.445882
1,Canada,Tablet,61.066692,-107.991707
2,United Kingdom,Smart TV,54.702354,-3.276575
3,Australia,Laptop,-24.776109,134.755
4,Germany,Smartphone,51.163818,10.447831


In [None]:
# Plain-text dump of the frame (pandas truncates the middle rows)
print(data)

             Country      Device   Latitude   Longitude
0      United States  Smartphone  39.783730 -100.445882
1             Canada      Tablet  61.066692 -107.991707
2     United Kingdom    Smart TV  54.702354   -3.276575
3          Australia      Laptop -24.776109  134.755000
4            Germany  Smartphone  51.163818   10.447831
...              ...         ...        ...         ...
2495           Spain    Smart TV  39.326068   -4.837979
2496           Spain    Smart TV  39.326068   -4.837979
2497   United States      Laptop  39.783730 -100.445882
2498          Canada      Tablet  61.066692 -107.991707
2499   United States    Smart TV  39.783730 -100.445882

[2500 rows x 4 columns]


## 4. Duplicate Detection

In [None]:
# Count rows that repeat an earlier row across all columns
duplicate_count = data.duplicated().sum()
print(duplicate_count)

2460


In [None]:
# Expand "United States" to its long form, then title-case every name while
# keeping the connective "of" in lower case.
data['Country'] = (
    data['Country']
    .replace('United States', 'United States of America')
    .str.title()
    .replace(' Of ', ' of ', regex=True)
)

In [None]:
# Draw random coordinates inside a country's borders using a GeoJSON file, with caching
class RandomCoordinateGenerator:
    """Sample random points that fall inside a country's geometry.

    Points are drawn uniformly from the geometry's bounding box and rejected
    until one lies inside the geometry itself.

    NOTE(review): relies on ``gpd``, ``Point`` and ``np`` being in scope; no
    import for them is visible in this notebook -- confirm they are imported
    (presumably geopandas, shapely.geometry.Point and numpy).

    Args:
        geojson_file: Path handed to ``gpd.read_file``; the result must expose
            ``name`` and ``geometry`` columns.
        seed: Optional seed. When given, draws come from a dedicated
            ``np.random.default_rng(seed)`` so runs are reproducible; when
            omitted, the global ``np.random`` state is used as before.
    """

    def __init__(self, geojson_file='world.json', seed=None):
        self.world = gpd.read_file(geojson_file)
        # country name -> (minx, miny, maxx, maxy)
        self.bounding_boxes = {}
        # country name -> geometry, cached so the frame is filtered only once per country
        self._geometries = {}
        self._rng = np.random.default_rng(seed) if seed is not None else np.random

    def _geometry_for(self, country):
        """Return the cached geometry for ``country``, loading it on first use."""
        if country not in self._geometries:
            matches = self.world.loc[self.world['name'] == country, 'geometry']
            if matches.empty:
                raise ValueError(f"Country '{country}' not found in the GeoJSON file.")
            geometry = matches.values[0]
            # An empty geometry can never contain a point, so sampling would spin forever
            if geometry is None or geometry.is_empty:
                raise ValueError(f"Country '{country}' has no usable geometry in the GeoJSON file.")
            self._geometries[country] = geometry
            self.bounding_boxes[country] = geometry.bounds
        return self._geometries[country]

    def get_random_coordinates(self, country):
        """Return a random ``(latitude, longitude)`` inside ``country``.

        Raises:
            ValueError: If the country is missing from the GeoJSON file or has
                no usable geometry.
        """
        geometry = self._geometry_for(country)
        minx, miny, maxx, maxy = self.bounding_boxes[country]

        # Rejection sampling: the bounding box is a superset of the geometry
        while True:
            candidate = Point(self._rng.uniform(minx, maxx), self._rng.uniform(miny, maxy))
            if geometry.contains(candidate):
                return candidate.y, candidate.x

In [None]:
# Initialize the random coordinate generator
generator = RandomCoordinateGenerator()

iteration_limit = 10
iterations = 0
dedupe_columns = ['Device', 'Latitude', 'Longitude']
# Countries the generator cannot sample. Their rows are skipped so the same
# error is not reprinted for every duplicate row on every pass.
unresolved_countries = set()

while iterations < iteration_limit:
    duplicate_mask = data.duplicated(dedupe_columns)
    fixable_mask = duplicate_mask & ~data['Country'].isin(unresolved_countries)

    if not fixable_mask.any():
        break

    # Give every fixable duplicate row a fresh random point inside its country
    for index in data.index[fixable_mask]:
        country = data.loc[index, 'Country']
        if country in unresolved_countries:
            continue
        try:
            new_latitude, new_longitude = generator.get_random_coordinates(country)
        except ValueError as e:
            print(e)
            unresolved_countries.add(country)
            continue
        data.loc[index, 'Latitude'] = new_latitude
        data.loc[index, 'Longitude'] = new_longitude

    iterations += 1

# Report the actual outcome: the loop can also end because the iteration limit
# was reached or because some countries could not be sampled.
duplicate_count = data.duplicated(dedupe_columns).sum()
if duplicate_count == 0:
    print("Tidak ada data yang duplikat atau sama persis lagi.")
else:
    print(f"Masih ada {duplicate_count} baris duplikat setelah {iterations} iterasi.")
print(data)

In [None]:
# Preview the frame after the duplicate-coordinate pass
data.head()

## 5. Data Transformation

In [None]:
from sklearn.preprocessing import LabelEncoder

# Overwrite the Device column with the codes produced by LabelEncoder.fit_transform;
# the fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
data['Device'] = label_encoder.fit_transform(data['Device'])

# Show the distinct encoded values now present in the column
print("Data hasil transformation", data['Device'].unique())

In [None]:
# Preview the frame after encoding the Device column
data.head()

In [None]:
# Re-count rows that repeat an earlier row across all columns
duplicate_count = data.duplicated().sum()
print(duplicate_count)

2460


In [None]:
# Write the processed frame to disk in CSV format, omitting the index column.
# NOTE(review): the target file name has no ".csv" extension -- confirm this is intended.
data.to_csv('Netflix Dataset', index=False)