# 探索性数据分析

本次实验要求我们利用给定的8000个英文名字训练一个循环神经网络，使其能够根据任意给定的名字前缀生成（补全）完整的名字。为了更好地完成这一任务，我们应首先对数据集进行必要的分析：

## Total Dataset items

In [9]:
import os
from typing import List, Tuple, Dict

def build_dataset(files_list=('data/female.txt',
                              'data/male.txt')) -> List[str]:
    """Load names from text files into a single lower-cased list.

    Each line is stripped of surrounding whitespace and lower-cased.
    Lines that start with ``'# '`` (presumably header comments in the data
    files) and lines that are empty after stripping are skipped.
    Duplicates are kept; the result follows the order of ``files_list``
    and then the line order within each file.

    Args:
        files_list: Iterable of paths to text files holding one name per
            line. The default is a tuple rather than a list so the shared
            default object cannot be mutated between calls.

    Returns:
        The cleaned names from all files, in reading order.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    dataset = []
    for file_path in files_list:
        # An explicit encoding keeps the result independent of the
        # platform's locale default.
        with open(file_path, encoding='utf-8') as file:
            # Iterate the file object directly instead of loading every
            # line up front with readlines().
            for line in file:
                name = line.strip()
                # The '# ' check is made on the raw line, so only lines
                # that begin with it in the first column are dropped.
                if name and not line.startswith('# '):
                    dataset.append(name.lower())
    return dataset

# Load every name from the default female/male files; the cells below reuse nameSet.
nameSet = build_dataset()
# Total number of entries read (duplicates are not removed).
print(len(nameSet))

## Show the most common names and their frequencies

In [19]:
from nltk.probability import FreqDist

# Count how often each name occurs in the combined list.
fdist = FreqDist(nameSet)
# Keep the 50 most frequent (name, count) pairs.
tops=fdist.most_common(50)


print(tops)

[('Gale', 3), ('Abbey', 2), ('Abbie', 2), ('Abby', 2), ('Addie', 2), ('Adrian', 2), ('Adrien', 2), ('Ajay', 2), ('Alex', 2), ('Alexis', 2), ('Alfie', 2), ('Ali', 2), ('Alix', 2), ('Allie', 2), ('Allyn', 2), ('Andie', 2), ('Andrea', 2), ('Andy', 2), ('Angel', 2), ('Angie', 2), ('Ariel', 2), ('Ashley', 2), ('Aubrey', 2), ('Augustine', 2), ('Austin', 2), ('Averil', 2), ('Barrie', 2), ('Barry', 2), ('Beau', 2), ('Bennie', 2), ('Benny', 2), ('Bernie', 2), ('Bert', 2), ('Bertie', 2), ('Bill', 2), ('Billie', 2), ('Billy', 2), ('Blair', 2), ('Blake', 2), ('Bo', 2), ('Bobbie', 2), ('Bobby', 2), ('Brandy', 2), ('Brett', 2), ('Britt', 2), ('Brook', 2), ('Brooke', 2), ('Brooks', 2), ('Bryn', 2), ('Cal', 2)]


## Show the character set 

This part determines the size of our character set and lists the characters it contains.

In [43]:
# Collect every distinct character used across all names.
character_set = set()
for word in nameSet:
    # Print names that contain a space, an apostrophe or a hyphen so the
    # non-letter cases can be inspected by eye.
    if any(mark in word for mark in (" ", "'", "-")):
        print(word)
    character_set |= set(word)
print(character_set)

Ann-Mari
Ann-Marie
Anna-Diana
Anna-Diane
Anna-Maria
Anne-Corinne
Anne-Mar
Anne-Marie
Barbara-Anne
Bette-Ann
Carol-Jean
Dee Dee
Diane-Marie
E'Lane
Helen-Elizabeth
Holly-Anne
Jo Ann
Jo-Ann
Jo-Anne
Kara-Lynn
Marie-Ann
Marie-Jeanne
Paula-Grace
Sara-Ann
Sheila-Kathryn
Sue-elle
Terri-Jo
Theresa-Marie
Zsa Zsa
Hans-Peter
Jean-Christophe
Jean-Francois
Jean-Lou
Jean-Luc
Jean-Marc
Jean-Paul
Jean-Pierre
John-David
John-Patrick
{'D', 'M', 'L', 'A', ' ', 'J', 'W', 'c', 'n', 'K', 'b', 'j', 'U', 'P', 'l', 'z', 'w', 'x', 'o', 'N', 'r', 'y', 'Z', 'B', 'e', 'Q', 'T', 'I', 'a', 'p', 'R', 'k', 'Y', 'v', 'E', 'i', 'u', "'", 'F', '-', 's', 'q', 'O', 'S', 'g', 'd', 'C', 't', 'V', 'X', 'm', 'h', 'f', 'G', 'H'}


In [28]:
# Print the characters in sorted order, which is easier to read than the unordered set.
print(sorted(character_set))

[' ', "'", '-', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## Show the longest name

In [33]:
# Pair each name with its length; the length comes first so the tuples order by length.
word_length = [(len(word), word) for word in nameSet]

In [41]:
# Length of the longest name: the first element of the largest (length, name) tuple.
# max() finds that tuple in a single pass, so the whole list need not be sorted.
max(word_length)[0]

15